|
|
@@ -1,173 +1,130 @@
|
|
|
package top.lvzhiqiang.util;
|
|
|
|
|
|
+import org.jsoup.Connection;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
|
-import org.jsoup.nodes.Element;
|
|
|
-import org.jsoup.select.Elements;
|
|
|
|
|
|
-import java.io.File;
|
|
|
-import java.util.ArrayList;
|
|
|
-import java.util.List;
|
|
|
+import javax.net.ssl.*;
|
|
|
+import java.net.Proxy;
|
|
|
+import java.security.SecureRandom;
|
|
|
+import java.security.cert.CertificateException;
|
|
|
+import java.security.cert.X509Certificate;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.Random;
|
|
|
|
|
|
public class JsoupUtil {
|
|
|
+ private static int TIMEOUT_CONNECTION = 60000;
|
|
|
+ public static String HTTP_GET = "GET";
|
|
|
+ public static String HTTP_POST = "POST";
|
|
|
+
|
|
|
+ private static Connection getConnection(String url, Proxy proxy) {
|
|
|
+ return Jsoup.connect(url)
|
|
|
+ .timeout(TIMEOUT_CONNECTION)
|
|
|
+ .proxy(proxy)
|
|
|
+ .userAgent(getUserAgent())
|
|
|
+ .followRedirects(true)
|
|
|
+ .ignoreContentType(true);
|
|
|
+ }
|
|
|
|
|
|
- private static Document doc = null;
|
|
|
- private static Elements eles = null;
|
|
|
-
|
|
|
-
|
|
|
- /**
|
|
|
- * 获取xml文件(file格式)
|
|
|
- *
|
|
|
- * @param file
|
|
|
- */
|
|
|
- public static Document setXmlFile(File file) {
|
|
|
- try {
|
|
|
- doc = Jsoup.parse(file, "UTF-8");
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
+ public static Document requestDocument(String url, String httpMethod, Map<String, String> cookies, Map<String, String> headers, Proxy proxy, Map<String, String> data) throws Exception {
|
|
|
+ Connection connection = getConnection(url, proxy);
|
|
|
+ if (data != null && data.size() > 0) {
|
|
|
+ connection.data(data);
|
|
|
+ }
|
|
|
+ if (cookies != null) {
|
|
|
+ connection.cookies(cookies);
|
|
|
}
|
|
|
- return doc;
|
|
|
+ if (headers != null) {
|
|
|
+ connection.headers(headers);
|
|
|
+ }
|
|
|
+ Document resultDocument = HTTP_POST.equalsIgnoreCase(httpMethod) ? connection.post() : connection.get();
|
|
|
+ return resultDocument;
|
|
|
}
|
|
|
|
|
|
-
|
|
|
- /**
|
|
|
- * 获取xml文件(绝对路径)
|
|
|
- *
|
|
|
- * @param path
|
|
|
- */
|
|
|
- public static Document setXmlFile(String path) {
|
|
|
- try {
|
|
|
- File file = new File(path);
|
|
|
- doc = Jsoup.parse(file, "UTF-8");
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- }
|
|
|
- return doc;
|
|
|
+ public static Document requestDocument(String url, String httpMethod, Proxy proxy, Map<String, String> data) throws Exception {
|
|
|
+ return requestDocument(url, httpMethod, null, null, proxy, data);
|
|
|
}
|
|
|
|
|
|
+ public static Document requestDocument(String url, String httpMethod, Proxy proxy, Map<String, String> headers, Map<String, String> data) throws Exception {
|
|
|
+ return requestDocument(url, httpMethod, proxy, headers, data);
|
|
|
+ }
|
|
|
|
|
|
- /**
|
|
|
- * 根据拼接节点获取元素集合
|
|
|
- *
|
|
|
- * @param nodeQuery
|
|
|
- * @return
|
|
|
- */
|
|
|
- public static Elements getEles(String nodeQuery) {
|
|
|
- try {
|
|
|
- eles = doc.select(nodeQuery);
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
+ public static Connection.Response requestBody(String url, String httpMethod, Map<String, String> cookies, Map<String, String> headers, Proxy proxy, Map<String, String> data) throws Exception {
|
|
|
+ Connection connection = getConnection(url, proxy);
|
|
|
+ if (data != null && data.size() > 0) {
|
|
|
+ connection.data(data);
|
|
|
}
|
|
|
-
|
|
|
- return eles;
|
|
|
+ if (cookies != null) {
|
|
|
+ connection.cookies(cookies);
|
|
|
+ }
|
|
|
+ if (headers != null) {
|
|
|
+ connection.headers(headers);
|
|
|
+ }
|
|
|
+ connection.method(HTTP_POST.equalsIgnoreCase(httpMethod) ? Connection.Method.POST : Connection.Method.GET);
|
|
|
+ Connection.Response res = connection.execute();
|
|
|
+ return res;
|
|
|
}
|
|
|
|
|
|
+ public static Connection.Response requestBody(String url, String httpMethod, Proxy proxy, Map<String, String> data) throws Exception {
|
|
|
+ return requestBody(url, httpMethod, null, null, proxy, data);
|
|
|
+ }
|
|
|
|
|
|
- public static void main(String[] args) {
|
|
|
-
|
|
|
- /*String http="http://zizhan.mot.gov.cn/sj/kejs/kejifzh_kjs/";
|
|
|
- List<String> newsLink =new ArrayList<String>();
|
|
|
- try {
|
|
|
- //获取所需要的所有页面链接
|
|
|
- Document doc=Jsoup.connect(http).get();
|
|
|
- Elements eles=doc.select("div.main_cont1 > ul > li > a");
|
|
|
- for (Element element : eles) {
|
|
|
-
|
|
|
- String href=element.attr("href").toString();
|
|
|
- href=href.substring(8);
|
|
|
- newsLink.add("http://zizhan.mot.gov.cn"+href);
|
|
|
- }
|
|
|
-
|
|
|
- //从每个页面中获取所需字段
|
|
|
- for (String str : newsLink) {
|
|
|
- try {
|
|
|
- doc=Jsoup.connect(str).get();
|
|
|
- String title=doc.select("div#cont_detail > div.docTitleCls").text();
|
|
|
- String content=doc.select("div#cont_detail > div").get(1).html();
|
|
|
- if(title==null || title.equals("") || content==null || content.equals(""))
|
|
|
- continue;
|
|
|
- //获取来源和时间
|
|
|
- Elements ele=doc.select("div.continfo>table>tbody>tr").get(2).select("td");
|
|
|
- String createDate=ele.get(0).text().replaceAll("发文日期:","");
|
|
|
-
|
|
|
- Elements elesource=doc.select("div.continfo>table>tbody>tr").get(1).select("td");
|
|
|
- String source=elesource.get(1).text().replaceAll("发布机构:","");
|
|
|
- System.out.println(createDate);
|
|
|
- System.out.println(title);
|
|
|
- System.out.println(source);
|
|
|
- System.out.println(content);
|
|
|
- System.out.println("=============================================");
|
|
|
-
|
|
|
- } catch (Exception e) {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- }*/
|
|
|
+ public static Connection.Response requestBody(String url, String httpMethod, Proxy proxy, Map<String, String> headers, Map<String, String> data) throws Exception {
|
|
|
+ return requestBody(url, httpMethod, null, headers, proxy, data);
|
|
|
+ }
|
|
|
|
|
|
+ private static String getUserAgent() {
|
|
|
+ Random r = new Random();
|
|
|
+ String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
|
|
|
+ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"};
|
|
|
+ int i = r.nextInt(15);
|
|
|
+ return ua[i];
|
|
|
+ }
|
|
|
|
|
|
+ /**
|
|
|
+ * 信任任何站点
|
|
|
+ */
|
|
|
+ private static void trustEveryone() {
|
|
|
try {
|
|
|
+ HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
|
|
|
+ @Override
|
|
|
+ public boolean verify(String hostname, SSLSession session) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ });
|
|
|
|
|
|
- List<String> newsLink = new ArrayList<String>();
|
|
|
- Document doc = Jsoup.connect("http://www.cmzz100.com/cn/G100/toutiao.html").get();
|
|
|
-
|
|
|
- //Elements eles=doc.select("table.border-style>tbody>tr>td>table>tbody>tr").get(1).select("table>tbody>tr>td>table>tbody>tr").get(1).select("td");
|
|
|
- Elements eles = doc.select("div.title>a");
|
|
|
-
|
|
|
-
|
|
|
- for (Element element : eles) {
|
|
|
- newsLink.add("http://www.cmzz100.com" + element.attr("href"));
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- //从每个页面中获取所需字段
|
|
|
- for (String str : newsLink) {
|
|
|
- try {
|
|
|
- doc = Jsoup.connect(str).get();
|
|
|
- String title = doc.select("div.title").text();
|
|
|
- Elements ele1 = doc.select("div.info");
|
|
|
- for (Element el : ele1) {
|
|
|
- if (!doc.select("div.info").get(0).select("a>img").equals("")) {
|
|
|
- Elements ele2 = doc.select("div.info").get(0).select("a>img");
|
|
|
- ele2.attr("src", "http://www.cmzz100.com" + ele2.attr("src"));
|
|
|
- }
|
|
|
- Elements els3 = doc.select("div.info p>img[src]");
|
|
|
- els3.attr("src", "http://www.cmzz100.com" + els3.attr("src"));
|
|
|
- // System.out.println(els3.toString());
|
|
|
- // System.out.println("=============================================");
|
|
|
- // el.attr("src","http://www.cmzz100.com"+el.attr("src"));
|
|
|
- // System.out.print(el.toString());
|
|
|
- }
|
|
|
- String content = doc.select("div.info").html();
|
|
|
-
|
|
|
- System.out.print(ele1);
|
|
|
- System.out.println("=============================================");
|
|
|
- if (title == null || title.equals("") || content == null || content.equals(""))
|
|
|
- continue;
|
|
|
- //获取来源和时间
|
|
|
- Elements ele = doc.select("div.datetime");
|
|
|
- String createDate = ele.text().substring(0, ele.text().indexOf("|")).replaceAll("发表:", "");
|
|
|
-
|
|
|
- //System.out.println(createDate);
|
|
|
-
|
|
|
-
|
|
|
- //System.out.println(title);
|
|
|
- //System.out.println(source);
|
|
|
- //System.out.println(content);
|
|
|
+ SSLContext context = SSLContext.getInstance("TLS");
|
|
|
+ context.init(null, new X509TrustManager[]{new X509TrustManager() {
|
|
|
+ @Override
|
|
|
+ public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
|
|
|
+ }
|
|
|
|
|
|
+ @Override
|
|
|
+ public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
|
|
|
+ }
|
|
|
|
|
|
- } catch (Exception e) {
|
|
|
- continue;
|
|
|
+ @Override
|
|
|
+ public X509Certificate[] getAcceptedIssuers() {
|
|
|
+ return new X509Certificate[0];
|
|
|
}
|
|
|
- }
|
|
|
+ }}, new SecureRandom());
|
|
|
+ HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
|
|
|
} catch (Exception e) {
|
|
|
- // TODO: handle exception
|
|
|
+ e.printStackTrace();
|
|
|
}
|
|
|
}
|
|
|
}
|