Test4Javbus.java 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. import com.alibaba.fastjson.JSONObject;
  2. import org.apache.commons.lang3.StringUtils;
  3. import org.jsoup.Connection;
  4. import org.jsoup.Jsoup;
  5. import org.jsoup.nodes.Document;
  6. import org.jsoup.nodes.Element;
  7. import org.jsoup.select.Elements;
  8. import top.lvzhiqiang.entity.CrawlerJavbusProfile;
  9. import top.lvzhiqiang.util.DateUtils;
  10. import top.lvzhiqiang.util.JsoupUtil;
  11. import java.io.*;
  12. import java.net.HttpURLConnection;
  13. import java.net.InetSocketAddress;
  14. import java.net.Proxy;
  15. import java.net.URL;
  16. import java.time.LocalDateTime;
  17. import java.util.*;
  18. public class Test4Javbus {
  19. public static void main(String[] args) throws Exception {
  20. //setupOne();
  21. setupTwo();
  22. }
  23. private static void setupTwo() throws Exception {
  24. File file = new File("d:\\zhiqiang.lv\\Desktop", "1.html");
  25. // file = new File("C:\\Users\\l1024v\\Desktop", "1.html");
  26. Document document = Jsoup.parse(file, "UTF-8");
  27. String avatarUrl = document.select("div.avt").select("img").attr("src");
  28. String[] mbn0Arr = document.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
  29. String nickName = mbn0Arr[0].trim();
  30. String uid = mbn0Arr[1].trim();
  31. String emailStatus = document.select("div.u_profile").select("div.cl").get(0).select("ul").first().text().replace("郵箱狀態", "").trim();
  32. Elements signEles = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(1).select("li:contains(個人簽名)");
  33. String signStr = "";
  34. ArrayList<String> signImgList = new ArrayList<>();
  35. if (signEles.size() > 0) {
  36. signStr = signEles.first().select("table").text();
  37. Elements signImgEles = signEles.first().select("table").select("img");
  38. for (Element signImgEle : signImgEles) {
  39. signImgList.add(signImgEle.attr("src"));
  40. }
  41. }
  42. String friendNum = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
  43. .select("a").get(0).text().replace("好友數", "").trim();
  44. String replyNum = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
  45. .select("a").get(1).text().replace("回帖數", "").trim();
  46. String threadNum = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
  47. .select("a").get(2).text().replace("主題數", "").trim();
  48. String userGroup = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(0)
  49. .select("a").text();
  50. String onlineTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  51. .select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
  52. String registrationTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  53. .select("li:contains(註冊時間)").text().replace("註冊時間", "").trim();
  54. String lastVisit = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  55. .select("li:contains(最後訪問)").text().replace("最後訪問", "").trim();
  56. String lastActivityTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  57. .select("li:contains(上次活動時間)").text().replace("上次活動時間", "").trim();
  58. String lastPublishedTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  59. .select("li:contains(上次發表時間)").text().replace("上次發表時間", "").trim();
  60. String timeZone = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  61. .select("li:contains(所在時區)").text().replace("所在時區", "").trim();
  62. String usedSpace = document.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
  63. .select("li").get(0).text().replace("已用空間", "").replace("B", "").trim();
  64. String mileage = document.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
  65. .select("li").get(1).text().replace("里程", "").trim();
  66. String money = document.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
  67. .select("li").last().text().replace("金錢", "").trim();
  68. CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
  69. crawlerJavbusProfile.setUid(Long.valueOf(uid));
  70. crawlerJavbusProfile.setNickName(nickName);
  71. crawlerJavbusProfile.setEmailStatus(emailStatus);
  72. crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
  73. crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
  74. crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
  75. crawlerJavbusProfile.setUserGroup(userGroup);
  76. crawlerJavbusProfile.setOnlineTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(onlineTime) ? Integer.valueOf(onlineTime) : null);
  77. crawlerJavbusProfile.setRegistrationTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(registrationTime) ? LocalDateTime.parse(registrationTime, DateUtils.dateTimeFormatter3) : null);
  78. crawlerJavbusProfile.setLastVisit(top.lvzhiqiang.util.StringUtils.isNotEmpty(lastVisit) ? LocalDateTime.parse(lastVisit, DateUtils.dateTimeFormatter3) : null);
  79. crawlerJavbusProfile.setLastActivityTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(lastActivityTime) ? LocalDateTime.parse(lastActivityTime, DateUtils.dateTimeFormatter3) : null);
  80. crawlerJavbusProfile.setLastPublishedTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(lastPublishedTime) ? LocalDateTime.parse(lastPublishedTime, DateUtils.dateTimeFormatter3) : null);
  81. crawlerJavbusProfile.setTimeZone(top.lvzhiqiang.util.StringUtils.isNotEmpty(timeZone) ? timeZone : null);
  82. crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
  83. crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
  84. crawlerJavbusProfile.setMoney(Integer.valueOf(money));
  85. crawlerJavbusProfile.setAvatarUrl(avatarUrl);
  86. crawlerJavbusProfile.setSignStr(signStr);
  87. crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
  88. System.out.println(crawlerJavbusProfile);
  89. }
  90. private static void setupOne() throws Exception {
  91. // 代理及TOKEN设置
  92. Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
  93. // 1 登陆获取cookies
  94. // 1.0 https://www.javbus.com/forum/forum.php
  95. Connection.Response forumResponse = JsoupUtil.requestBody("https://www.javbus.com/forum/forum.php", JsoupUtil.HTTP_GET, proxy, null);
  96. Map<String, String> forumCookies = forumResponse.cookies();
  97. System.out.println("forumCookies=" + forumCookies);
  98. // 1.1 https://www.javbus.com/forum/member.php
  99. String memberUrl = "https://www.javbus.com/forum/member.php";
  100. Map<String, String> params = new HashMap<>();
  101. params.put("mod", "logging");
  102. params.put("action", "login");
  103. params.put("referer", "");
  104. params.put("infloat", "yes");
  105. params.put("handlekey", "login");
  106. params.put("inajax", "1");
  107. params.put("ajaxtarget", "fwin_content_login");
  108. String memberHtmlStr = JsoupUtil.requestDocument(memberUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, null, params).html().replace("<![CDATA[", "").replace("]]>", "");
  109. Document memberDocument = Jsoup.parse(memberHtmlStr);
  110. String key1 = memberDocument.select("input[type='password']").first().attr("id").split("_")[1];
  111. String key2 = memberDocument.select("span[id^='seccode']").first().attr("id").split("_")[1];
  112. String key3 = memberDocument.select("input[name='formhash']").first().val();
  113. // 1.2 https://www.javbus.com/forum/misc.php
  114. String miscUrl = "https://www.javbus.com/forum/misc.php";
  115. params.clear();
  116. params.put("mod", "seccode");
  117. params.put("action", "update");
  118. params.put("idhash", key2);
  119. params.put("modid", "member::logging");
  120. Document miscDocument = JsoupUtil.requestDocument(miscUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, null, params);
  121. String imgVerifyUrl = "https://www.javbus.com/forum/" + miscDocument.select("img[onclick]").first().attr("src");
  122. System.out.println("imgVerifyUrl=" + imgVerifyUrl);
  123. // 1.3 get imgVerifyUrl
  124. Map<String, String> headerParams = new HashMap<>();
  125. headerParams.put("referer", "https://www.javbus.com/forum/forum.php");
  126. Connection.Response imgResponse = JsoupUtil.requestBody(imgVerifyUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, headerParams, null);
  127. byte[] imgBytes = imgResponse.bodyAsBytes();
  128. Map<String, String> imgCookies = imgResponse.cookies();
  129. System.out.println("imgCookies=" + imgCookies);
  130. String cookieKey4Seccode = "";
  131. for (Map.Entry<String, String> imgCookie : imgCookies.entrySet()) {
  132. if (imgCookie.getKey().contains("seccode")) {
  133. cookieKey4Seccode = imgCookie.getKey();
  134. break;
  135. }
  136. }
  137. // 1.4 get imgVerifyNumber by BaiduOCR
  138. {
  139. //一次最多读取1k
  140. byte[] buffer = new byte[1024];
  141. //实际读取的长度
  142. int readLenghth;
  143. //创建的一个写出的缓冲流
  144. File savePathPrexFile = new File("d:\\zhiqiang.lv\\Desktop");
  145. BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePathPrexFile, "1.png")));
  146. //文件逐步写入本地
  147. BufferedInputStream bufferedInputStream = new BufferedInputStream(new ByteArrayInputStream(imgBytes));
  148. while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
  149. bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
  150. }
  151. //关闭缓冲流
  152. bufferedOutputStream.close();
  153. bufferedInputStream.close();
  154. }
  155. String ocrAccurateBasicUrl = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic";
  156. String accessToken = getAuth("taQYPoO9deuODxEsltkGFyqG", "lm9laVGOO14cH3sfWvtwcL1GtEC8rwS9");
  157. headerParams.clear();
  158. headerParams.put("Content-Type", "application/x-www-form-urlencoded");
  159. params.clear();
  160. params.put("image", Base64.getEncoder().encodeToString(imgBytes));
  161. Connection.Response ocrResponse = JsoupUtil.requestBody(ocrAccurateBasicUrl.concat("?access_token=").concat(accessToken),
  162. JsoupUtil.HTTP_POST, Proxy.NO_PROXY, headerParams, params);
  163. JSONObject crAccurateBasicResult = JSONObject.parseObject(ocrResponse.body());
  164. String seccodeverify = crAccurateBasicResult.getJSONArray("words_result").getJSONObject(0).getString("words");
  165. // 1.5 https://www.javbus.com/forum/member.php
  166. StringBuffer sbParams = new StringBuffer();
  167. sbParams.append("?mod=logging&action=login&loginsubmit=yes&handlekey=login&loginhash=").append(key1).append("&inajax=1");
  168. headerParams.clear();
  169. headerParams.put("Content-Type", "application/x-www-form-urlencoded");
  170. params.clear();
  171. params.put("formhash", key3);
  172. params.put("referer", "https://www.javbus.com/forum/forum.php");
  173. params.put("loginfield", "username");
  174. params.put("username", "Tujide.lv");
  175. params.put("password", "Lzq920165830.");
  176. params.put("questionid", "0");
  177. params.put("answer", "");
  178. params.put("seccodehash", key2);
  179. params.put("seccodemodid", "member::logging");
  180. params.put("seccodeverify", seccodeverify);
  181. System.out.println(sbParams);
  182. System.out.println(params);
  183. if (cookieKey4Seccode != "") {
  184. forumCookies.put("existmag", "mag");
  185. forumCookies.put(cookieKey4Seccode, imgCookies.get(cookieKey4Seccode));
  186. System.out.println("forumCookies2=" + forumCookies);
  187. }
  188. Connection.Response loginResponse = JsoupUtil.requestBody(memberUrl.concat(sbParams.toString()), JsoupUtil.HTTP_POST, proxy, forumCookies, headerParams, params);
  189. Map<String, String> loginCookies = loginResponse.cookies();
  190. System.out.println("loginCookies=" + loginCookies);
  191. System.out.println(loginResponse.body());
  192. for (Map.Entry<String, String> loginCookie : loginCookies.entrySet()) {
  193. if (loginCookie.getKey().contains("ulastactivity")) {
  194. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  195. } else if (loginCookie.getKey().contains("auth")) {
  196. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  197. } else if (loginCookie.getKey().contains("lastcheckfeed")) {
  198. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  199. } else if (loginCookie.getKey().contains("lip")) {
  200. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  201. }
  202. }
  203. System.out.println("loginCookies2=" + forumCookies);
  204. // 2 获取个人资料
  205. Connection.Response memberInfoResponse = JsoupUtil.requestBody("https://www.javbus.com/forum/?355292", JsoupUtil.HTTP_GET, proxy, forumCookies, null, null);
  206. System.out.println("memberInfoCookies=" + memberInfoResponse.cookies());
  207. System.out.println(memberInfoResponse.body());
  208. }
  209. public static String getAuth(String ak, String sk) {
  210. // 获取token地址
  211. String authHost = "https://aip.baidubce.com/oauth/2.0/token?";
  212. String getAccessTokenUrl = authHost
  213. // 1. grant_type为固定参数
  214. + "grant_type=client_credentials"
  215. // 2. 官网获取的 API Key
  216. + "&client_id=" + ak
  217. // 3. 官网获取的 Secret Key
  218. + "&client_secret=" + sk;
  219. try {
  220. URL realUrl = new URL(getAccessTokenUrl);
  221. // 打开和URL之间的连接
  222. HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection();
  223. connection.setRequestMethod("GET");
  224. connection.connect();
  225. // 获取所有响应头字段
  226. Map<String, List<String>> map = connection.getHeaderFields();
  227. // 遍历所有的响应头字段
  228. for (String key : map.keySet()) {
  229. System.err.println(key + "--->" + map.get(key));
  230. }
  231. // 定义 BufferedReader输入流来读取URL的响应
  232. BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
  233. String result = "";
  234. String line;
  235. while ((line = in.readLine()) != null) {
  236. result += line;
  237. }
  238. /**
  239. * 返回结果示例
  240. */
  241. System.err.println("result:" + result);
  242. JSONObject jsonObject = JSONObject.parseObject(result);
  243. String access_token = jsonObject.getString("access_token");
  244. return access_token;
  245. } catch (Exception e) {
  246. System.err.print("获取token失败!");
  247. e.printStackTrace(System.err);
  248. }
  249. return null;
  250. }
  251. }