Crawler4JavbusServiceImpl.java 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601
  1. package top.lvzhiqiang.service.impl;
  2. import com.alibaba.fastjson.JSONObject;
  3. import com.github.pagehelper.PageHelper;
  4. import com.github.pagehelper.PageInfo;
  5. import com.xxl.job.core.context.XxlJobHelper;
  6. import lombok.extern.slf4j.Slf4j;
  7. import org.jsoup.Connection;
  8. import org.jsoup.HttpStatusException;
  9. import org.jsoup.Jsoup;
  10. import org.jsoup.nodes.Document;
  11. import org.jsoup.nodes.Element;
  12. import org.jsoup.select.Elements;
  13. import org.springframework.beans.factory.annotation.Value;
  14. import org.springframework.scheduling.annotation.Async;
  15. import org.springframework.stereotype.Service;
  16. import org.springframework.transaction.annotation.Propagation;
  17. import org.springframework.transaction.annotation.Transactional;
  18. import org.springframework.util.StopWatch;
  19. import top.lvzhiqiang.config.WebAppConfig;
  20. import top.lvzhiqiang.entity.CrawlerJavbusLog;
  21. import top.lvzhiqiang.entity.CrawlerJavbusProfile;
  22. import top.lvzhiqiang.entity.DicCode;
  23. import top.lvzhiqiang.entity.VideoSitePool;
  24. import top.lvzhiqiang.mapper.CrawlerJavbusProfileMapper;
  25. import top.lvzhiqiang.mapper.DicCodeMapper;
  26. import top.lvzhiqiang.mapper.VideoSitePoolMapper;
  27. import top.lvzhiqiang.service.Crawler4JavbusService;
  28. import top.lvzhiqiang.util.DateUtils;
  29. import top.lvzhiqiang.util.JsoupUtil;
  30. import top.lvzhiqiang.util.StringUtils;
  31. import javax.annotation.Resource;
  32. import java.io.BufferedReader;
  33. import java.io.InputStreamReader;
  34. import java.net.HttpURLConnection;
  35. import java.net.InetSocketAddress;
  36. import java.net.Proxy;
  37. import java.net.URL;
  38. import java.time.LocalDateTime;
  39. import java.util.*;
  40. import java.util.stream.Collectors;
  41. /**
  42. * Crawler Javbus ServiceImpl
  43. *
  44. * @author lvzhiqiang
  45. * 2022/10/17 14:47
  46. */
  47. @Service
  48. @Slf4j
  49. public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
  // Mapper whose findAll() rows are filtered into javbusConstantMap.
  @Resource
  private DicCodeMapper dicCodeMapper;
  // Mapper used by checkJavbusVideoSite to read, insert and flag javbus site URLs.
  @Resource
  private VideoSitePoolMapper videoSitePoolMapper;
  // Mapper used to read and persist crawled profiles and crawl log rows.
  @Resource
  private CrawlerJavbusProfileMapper crawlerJavbusProfileMapper;
  // Value of spring.profiles.active: "dev" selects the local proxy, and dictionary rows are
  // kept only when their env contains this value.
  @Value("${spring.profiles.active}")
  private String env;
  // javbus settings (code key -> code value) built from the dictionary rows; null until loaded.
  // NOTE(review): this and the three fields below are mutable, unsynchronized state that is also
  // written from the @Async entry points - confirm the bean is never used concurrently.
  Map<String, String> javbusConstantMap = null;
  // Cookies produced by generateJavbusCookies; beforeJavbus resets this to null when a login
  // attempt is not confirmed.
  Map<String, String> javbusCookiesMap = null;
  // Baidu access token returned by getAuth; fetched by beforeJavbus while it is empty.
  String bdAccessToken = "";
  // Proxy for javbus requests; chosen once by beforeJavbus (127.0.0.1:1080 for "dev", else none).
  Proxy proxy = null;
  62. public void beforeJavbus() throws Exception {
  63. if (null == proxy) {
  64. if ("dev".equals(env)) {
  65. proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
  66. } else {
  67. proxy = Proxy.NO_PROXY;
  68. }
  69. }
  70. if (StringUtils.isEmpty(bdAccessToken)) {
  71. bdAccessToken = getAuth(javbusConstantMap.get("bd_ak"), javbusConstantMap.get("bd_sk"));
  72. }
  73. if (null == javbusCookiesMap) {
  74. for (int i = 0; i < 3; i++) {
  75. if (generateJavbusCookies(proxy)) {
  76. break;
  77. } else {
  78. javbusCookiesMap = null;
  79. }
  80. }
  81. if (javbusCookiesMap == null) {
  82. throw new Exception("获取javbusCookies失败!");
  83. }
  84. }
  85. }
  86. @Override
  87. public Map<String, String> getJavbusCookiesMap() throws Exception {
  88. // 获取javbus常量MAP
  89. if (javbusConstantMap == null) {
  90. javbusConstantMap = dicCodeMapper.findAll().stream()
  91. .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  92. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  93. }
  94. // 代理及TOKEN设置
  95. beforeJavbus();
  96. return javbusCookiesMap;
  97. }
  98. @Override
  99. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  100. public void checkJavbusVideoSite() {
  101. XxlJobHelper.log("checkVideoSite开始==============================");
  102. // 获取javbus官方地址
  103. DicCode dicCode = WebAppConfig.dicCodeList.stream().filter(x -> 2 == x.getType() && "javbus".equals(x.getCodeKey())).findFirst().get();
  104. if (dicCode == null) {
  105. XxlJobHelper.log("javbus官方站点为Null");
  106. return;
  107. }
  108. // 获取javbusUrlList
  109. List<String> javbusUrlList = videoSitePoolMapper.findUrlByType(1);
  110. // 获取javbusNewUrlList
  111. Set<String> javbusNewUrlList = new HashSet<>();
  112. try {
  113. Document document = Jsoup.connect(dicCode.getCodeValue()).timeout(50000).ignoreContentType(true)
  114. .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
  115. .header("referer", "https://www.javbus.com/").get();
  116. Elements ahrefList = document.select("strong:contains(防屏蔽地址)").next("a");
  117. for (Element element : ahrefList) {
  118. String text = element.text();
  119. XxlJobHelper.log("Jsoup获取{}防屏蔽地址:{}", dicCode.getCodeValue(), text);
  120. javbusNewUrlList.add(text);
  121. }
  122. } catch (Exception e) {
  123. log.error("Jsoup获取{}防屏蔽地址异常", dicCode.getCodeValue(), e);
  124. XxlJobHelper.log("Jsoup获取{}防屏蔽地址异常", dicCode.getCodeValue());
  125. XxlJobHelper.log(e);
  126. }
  127. if (javbusNewUrlList.size() == 0) {
  128. XxlJobHelper.log("javbusNewUrlList为空");
  129. }
  130. if (javbusNewUrlList.size() == 0 && javbusUrlList.size() == 0) {
  131. XxlJobHelper.log("javbusUrlList和javbusNewUrlList为空");
  132. return;
  133. }
  134. // 校验新地址
  135. List<String> javbusNewUrlFinalList = javbusNewUrlList.stream().filter(e -> !javbusUrlList.contains(e)).collect(Collectors.toList());
  136. List<VideoSitePool> videoSitePoolList = new ArrayList<>();
  137. VideoSitePool videoSitePool;
  138. for (String javbusNewUrlFinal : javbusNewUrlFinalList) {
  139. try {
  140. Jsoup.connect(javbusNewUrlFinal).timeout(50000);
  141. videoSitePool = new VideoSitePool();
  142. videoSitePool.setUrl(javbusNewUrlFinal);
  143. videoSitePool.setType(1);
  144. videoSitePoolList.add(videoSitePool);
  145. XxlJobHelper.log("javbusNewUrlFinalList:javbus防屏蔽地址有效!javbusUrl={}", javbusNewUrlFinal);
  146. } catch (Exception e) {
  147. log.error("javbusNewUrlFinalList:javbus防屏蔽地址失效!javbusUrl={}", javbusNewUrlFinal, e);
  148. XxlJobHelper.log("javbusNewUrlFinalList:javbus防屏蔽地址失效!javbusUrl={}", javbusNewUrlFinal);
  149. XxlJobHelper.log(e);
  150. }
  151. }
  152. if (videoSitePoolList.size() > 0) {
  153. videoSitePoolMapper.insertList(videoSitePoolList);
  154. }
  155. // 校验存量地址
  156. for (String javbusUrl : javbusUrlList) {
  157. int deleteFlag = 1;
  158. try {
  159. Jsoup.connect(javbusUrl).timeout(50000);
  160. XxlJobHelper.log("javbusUrlList:javbus防屏蔽地址有效!javbusUrl={}", javbusUrl);
  161. } catch (Exception e) {
  162. deleteFlag = 2;
  163. log.error("javbusUrlList:javbus防屏蔽地址失效!javbusUrl={}", javbusUrl, e);
  164. XxlJobHelper.log("javbusUrlList:javbus防屏蔽地址失效!javbusUrl={}", javbusUrl);
  165. XxlJobHelper.log(e);
  166. }
  167. videoSitePoolMapper.updateDeleteFlag(javbusUrl, deleteFlag);
  168. }
  169. XxlJobHelper.log("checkVideoSite结束==============================");
  170. }
  171. @Async
  172. @Override
  173. public void jsoupJavbusProfile(Long start, Integer limit) throws Exception {
  174. log.warn("jsoupJavbusProfile 开始:start={},limit={}", start, limit);
  175. StopWatch stopWatch = new StopWatch();
  176. stopWatch.start();
  177. // 获取javbus常量MAP
  178. javbusConstantMap = dicCodeMapper.findAll().stream()
  179. .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  180. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  181. // 代理及TOKEN设置
  182. beforeJavbus();
  183. // 获取个人资料
  184. jsoupJavbusProfileSub(start, limit);
  185. log.warn("jsoupJavbusProfile 结束:time={}", stopWatch.getTotalTimeSeconds());
  186. }
  187. @Async
  188. @Override
  189. public void handleJavbusLog(Integer status) throws Exception {
  190. log.warn("handleJavbusLog 开始:status={}", status);
  191. StopWatch stopWatch = new StopWatch();
  192. stopWatch.start();
  193. // 获取javbus常量MAP
  194. javbusConstantMap = dicCodeMapper.findAll().stream()
  195. .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  196. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  197. // 代理及TOKEN设置
  198. beforeJavbus();
  199. // 获取个人资料
  200. handleJavbusLogSub(status);
  201. log.warn("handleJavbusLog 结束:time={}", stopWatch.getTotalTimeSeconds());
  202. }
  203. @Override
  204. public String findJavbusProfile(String keyword, Integer timeDay, Integer pic, String orderField, String order, Integer pageNo, Integer pageSize) {
  205. Map<String, Object> params = new HashMap<>();
  206. params.put("keyword", keyword);
  207. params.put("timeDay", timeDay);
  208. params.put("orderField", orderField);
  209. params.put("order", order);
  210. PageHelper.startPage(pageNo, pageSize);
  211. List<CrawlerJavbusProfile> crawlerJavbusProfileList = crawlerJavbusProfileMapper.findJavbusProfile4MultipleParams(params);
  212. PageInfo<CrawlerJavbusProfile> javbusProfilePageInfo = new PageInfo<>(crawlerJavbusProfileList);
  213. StringBuffer sb = new StringBuffer("total:".concat(String.valueOf(javbusProfilePageInfo.getTotal())).concat("<br/>"));
  214. sb.append("<table border=\"1\" cellspacing=\"0\"><tr><th>UID</th><th>昵称</th><th>邮箱状态</th><th>好友数</th><th>回帖数</th><th>主题数</th><th>用户组</th><th>在线时间</th><th>注册时间</th><th>上次活动时间</th><th>上次发表时间</th><th>所在时区</th><th>头像</th><th>个人签名文字</th><th>个人签名图片</th></tr>");
  215. for (CrawlerJavbusProfile crawlerJavbusProfile : crawlerJavbusProfileList) {
  216. sb.append("<tr>");
  217. sb.append("<td>").append(crawlerJavbusProfile.getUid()).append("</td>");
  218. sb.append("<td>").append(crawlerJavbusProfile.getNickName()).append("</td>");
  219. sb.append("<td>").append(crawlerJavbusProfile.getEmailStatus()).append("</td>");
  220. sb.append("<td>").append(crawlerJavbusProfile.getFriendNum()).append("</td>");
  221. sb.append("<td>").append(crawlerJavbusProfile.getReplyNum()).append("</td>");
  222. sb.append("<td>").append(crawlerJavbusProfile.getThreadNum()).append("</td>");
  223. sb.append("<td>").append(crawlerJavbusProfile.getUserGroup()).append("</td>");
  224. sb.append("<td>").append(crawlerJavbusProfile.getOnlineTime()).append("</td>");
  225. sb.append("<td>").append(crawlerJavbusProfile.getRegistrationTime()).append("</td>");
  226. sb.append("<td>").append(crawlerJavbusProfile.getLastActivityTime()).append("</td>");
  227. sb.append("<td>").append(crawlerJavbusProfile.getLastPublishedTime()).append("</td>");
  228. sb.append("<td>").append(crawlerJavbusProfile.getTimeZone()).append("</td>");
  229. if (pic == 2) {
  230. sb.append("<td>").append("<img src=\"" + crawlerJavbusProfile.getAvatarUrl() + "\" alt=\"封面\" width=\"147\" height=\"auto\" referrerpolicy=\"no-referrer\">").append("</td>");
  231. } else {
  232. sb.append("<td>").append(crawlerJavbusProfile.getAvatarUrl()).append("</td>");
  233. }
  234. sb.append("<td>").append(crawlerJavbusProfile.getSignStr()).append("</td>");
  235. if (pic == 2) {
  236. sb.append("<td>");
  237. String signImg = crawlerJavbusProfile.getSignImg();
  238. if (StringUtils.isNotEmpty(signImg)) {
  239. for (String s : signImg.split(",")) {
  240. sb.append("<img src=\"" + s + "\" alt=\"sign\" width=\"147\" height=\"auto\">");
  241. }
  242. } else {
  243. sb.append("--");
  244. }
  245. sb.append("</td>");
  246. } else {
  247. if (StringUtils.isNotEmpty(crawlerJavbusProfile.getSignImg())) {
  248. sb.append("<td>").append(crawlerJavbusProfile.getSignImg()).append("</td>");
  249. } else {
  250. sb.append("<td>--</td>");
  251. }
  252. }
  253. sb.append("</tr>");
  254. }
  255. sb.append("</table>");
  256. return sb.toString();
  257. }
  258. private void handleJavbusLogSub(Integer status) {
  259. List<CrawlerJavbusLog> javbusLogByStatus = crawlerJavbusProfileMapper.findJavbusLogByStatus(status);
  260. String profileUrl = "https://www.javbus.com/forum/?";
  261. Document profileDocument;
  262. for (CrawlerJavbusLog javbusLog : javbusLogByStatus) {
  263. try {
  264. profileDocument = JsoupUtil.requestDocument(profileUrl.concat(javbusLog.getBusinessKey()), JsoupUtil.HTTP_GET, proxy, javbusCookiesMap, null, null);
  265. if (profileDocument.html().contains("您指定的用戶空間不存在")) {
  266. log.warn("jsoupJavbusProfileSub您指定的用戶空間不存在,start={}", javbusLog.getBusinessKey());
  267. crawlerJavbusProfileMapper.deleteJavbusLogById2(javbusLog.getId(), "您指定的用戶空間不存在");
  268. continue;
  269. }
  270. if (profileDocument.html().contains("空間已被鎖定無法訪問")) {
  271. log.warn("jsoupJavbusProfileSub空間已被鎖定無法訪問,start={}", javbusLog.getBusinessKey());
  272. crawlerJavbusProfileMapper.deleteJavbusLogById2(javbusLog.getId(), "空間已被鎖定無法訪問,如有疑問請聯繫管理員");
  273. continue;
  274. }
  275. CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
  276. parseJavbusProfile(profileDocument, crawlerJavbusProfile);
  277. crawlerJavbusProfileMapper.insertOrUpdate(crawlerJavbusProfile);
  278. log.warn("jsoupJavbusProfileSub成功插入,businessKey={}", javbusLog.getBusinessKey());
  279. javbusLog.setStatus(2);
  280. javbusLog.setErrorMsg("");
  281. } catch (Exception e) {
  282. log.error("jsoupJavbusProfileSub插入异常,businessKey={}", javbusLog.getBusinessKey(), e);
  283. javbusLog.setStatus(3);
  284. javbusLog.setErrorMsg(e.getMessage());
  285. }
  286. crawlerJavbusProfileMapper.insertOrUpdateLog(javbusLog);
  287. }
  288. }
  289. private void jsoupJavbusProfileSub(Long start, Integer limit) {
  290. CrawlerJavbusProfile latestJavbusProfile = crawlerJavbusProfileMapper.findLatestInfo();
  291. if (start == null && latestJavbusProfile == null) {
  292. start = 1L;
  293. } else if (start == null && latestJavbusProfile != null) {
  294. start = latestJavbusProfile.getUid() + 1;
  295. }
  296. long startFinal = 0;
  297. if (limit != null) {
  298. startFinal = start + limit;
  299. }
  300. String profileUrl = "https://www.javbus.com/forum/?";
  301. Document profileDocument;
  302. int continueCount = 0;
  303. while (true) {
  304. if (startFinal != 0 && start > startFinal) {
  305. log.warn("jsoupJavbusProfileSub结束,start={},startFinal={}", start, startFinal);
  306. return;
  307. }
  308. if (continueCount > 10) {
  309. log.warn("jsoupJavbusProfileSub结束,start={},continueCount={}", start, continueCount);
  310. return;
  311. }
  312. try {
  313. profileDocument = JsoupUtil.requestDocument(profileUrl.concat(String.valueOf(start)), JsoupUtil.HTTP_GET, proxy, javbusCookiesMap, null, null);
  314. if (profileDocument.html().contains("您指定的用戶空間不存在")) {
  315. log.warn("jsoupJavbusProfileSub您指定的用戶空間不存在,start={}", start);
  316. start++;
  317. if (start > 500000) {
  318. continueCount++;
  319. }
  320. continue;
  321. }
  322. CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
  323. parseJavbusProfile(profileDocument, crawlerJavbusProfile);
  324. crawlerJavbusProfileMapper.insertOrUpdate(crawlerJavbusProfile);
  325. log.warn("jsoupJavbusProfileSub成功插入,start={}", start);
  326. } catch (Exception e) {
  327. log.error("jsoupJavbusProfileSub插入异常,start={}", start, e);
  328. CrawlerJavbusLog crawlerJavbusLog = new CrawlerJavbusLog();
  329. crawlerJavbusLog.setType(1);
  330. crawlerJavbusLog.setStatus(1);
  331. crawlerJavbusLog.setBusinessKey(String.valueOf(start));
  332. crawlerJavbusLog.setErrorMsg(e.getMessage());
  333. crawlerJavbusProfileMapper.insertOrUpdateLog(crawlerJavbusLog);
  334. }
  335. start++;
  336. }
  337. }
  338. public void parseJavbusProfile(Document profileDocument, CrawlerJavbusProfile crawlerJavbusProfile) {
  339. String avatarUrl = profileDocument.select("div.avt").select("img").attr("src");
  340. String[] mbn0Arr = profileDocument.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
  341. String nickName = mbn0Arr[0].trim();
  342. String uid = mbn0Arr[1].trim();
  343. String emailStatus = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").first().text().replace("郵箱狀態", "").trim();
  344. Elements signEles = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(1).select("li:contains(個人簽名)");
  345. String signStr = "";
  346. ArrayList<String> signImgList = new ArrayList<>();
  347. if (signEles.size() > 0) {
  348. signStr = signEles.first().select("table").text();
  349. Elements signImgEles = signEles.first().select("table").select("img");
  350. for (Element signImgEle : signImgEles) {
  351. signImgList.add(signImgEle.attr("src"));
  352. }
  353. }
  354. String friendNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
  355. .select("a").get(0).text().replace("好友數", "").trim();
  356. String replyNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
  357. .select("a").get(1).text().replace("回帖數", "").trim();
  358. String threadNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
  359. .select("a").get(2).text().replace("主題數", "").trim();
  360. String userGroup = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(0)
  361. .select("a").text();
  362. String onlineTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  363. .select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
  364. String registrationTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  365. .select("li:contains(註冊時間)").text().replace("註冊時間", "").trim();
  366. String lastVisit = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  367. .select("li:contains(最後訪問)").text().replace("最後訪問", "").trim();
  368. String lastActivityTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  369. .select("li:contains(上次活動時間)").text().replace("上次活動時間", "").trim();
  370. String lastPublishedTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  371. .select("li:contains(上次發表時間)").text().replace("上次發表時間", "").trim();
  372. String timeZone = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
  373. .select("li:contains(所在時區)").text().replace("所在時區", "").trim();
  374. String usedSpace = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
  375. .select("li").get(0).text().replace("已用空間", "").replace("B", "").trim();
  376. String mileage = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
  377. .select("li").get(1).text().replace("里程", "").trim();
  378. String money = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
  379. .select("li").last().text().replace("金錢", "").trim();
  380. crawlerJavbusProfile.setUid(Long.valueOf(uid));
  381. crawlerJavbusProfile.setNickName(nickName);
  382. crawlerJavbusProfile.setEmailStatus(emailStatus);
  383. crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
  384. crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
  385. crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
  386. crawlerJavbusProfile.setUserGroup(userGroup);
  387. crawlerJavbusProfile.setOnlineTime(StringUtils.isNotEmpty(onlineTime) ? Integer.valueOf(onlineTime) : null);
  388. crawlerJavbusProfile.setRegistrationTime(StringUtils.isNotEmpty(registrationTime) && registrationTime.length() >= 10 ? LocalDateTime.parse(registrationTime, DateUtils.dateTimeFormatter3) : null);
  389. crawlerJavbusProfile.setLastVisit(StringUtils.isNotEmpty(lastVisit) && lastVisit.length() >= 10 ? LocalDateTime.parse(lastVisit, DateUtils.dateTimeFormatter3) : null);
  390. crawlerJavbusProfile.setLastActivityTime(StringUtils.isNotEmpty(lastActivityTime) && lastActivityTime.length() >= 10 ? LocalDateTime.parse(lastActivityTime, DateUtils.dateTimeFormatter3) : null);
  391. crawlerJavbusProfile.setLastPublishedTime(StringUtils.isNotEmpty(lastPublishedTime) && lastPublishedTime.length() >= 10 ? LocalDateTime.parse(lastPublishedTime, DateUtils.dateTimeFormatter3) : null);
  392. crawlerJavbusProfile.setTimeZone(StringUtils.isNotEmpty(timeZone) ? timeZone : null);
  393. crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
  394. crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
  395. crawlerJavbusProfile.setMoney(Integer.valueOf(money));
  396. crawlerJavbusProfile.setAvatarUrl(avatarUrl);
  397. crawlerJavbusProfile.setSignStr(signStr);
  398. crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
  399. }
  400. private boolean generateJavbusCookies(Proxy proxy) throws Exception {
  401. // 1 登陆获取cookies
  402. // 1.0 https://www.javbus.com/forum/forum.php
  403. Connection.Response forumResponse = JsoupUtil.requestBody(javbusConstantMap.get("forum_url"), JsoupUtil.HTTP_GET, proxy, null);
  404. Map<String, String> forumCookies = forumResponse.cookies();
  405. log.warn("generateJavbusCookies=>,forum_url={},forumCookies={}", javbusConstantMap.get("forum_url"), forumCookies);
  406. // 1.1 https://www.javbus.com/forum/member.php
  407. Map<String, String> params = new HashMap<>(8);
  408. params.put("mod", "logging");
  409. params.put("action", "login");
  410. params.put("referer", "");
  411. params.put("infloat", "yes");
  412. params.put("handlekey", "login");
  413. params.put("inajax", "1");
  414. params.put("ajaxtarget", "fwin_content_login");
  415. String memberHtmlStr = JsoupUtil.requestDocument(javbusConstantMap.get("member_url"), JsoupUtil.HTTP_GET, proxy, forumCookies, null, params).html().replace("<![CDATA[", "").replace("]]>", "");
  416. Document memberDocument = Jsoup.parse(memberHtmlStr);
  417. String key1 = memberDocument.select("input[type='password']").first().attr("id").split("_")[1];
  418. String key2 = memberDocument.select("span[id^='seccode']").first().attr("id").split("_")[1];
  419. String key3 = memberDocument.select("input[name='formhash']").first().val();
  420. // 1.2 https://www.javbus.com/forum/misc.php
  421. params.clear();
  422. params.put("mod", "seccode");
  423. params.put("action", "update");
  424. params.put("idhash", key2);
  425. params.put("modid", "member::logging");
  426. Document miscDocument = JsoupUtil.requestDocument(javbusConstantMap.get("misc_url"), JsoupUtil.HTTP_GET, proxy, forumCookies, null, params);
  427. String imgVerifyUrl = "https://www.javbus.com/forum/" + miscDocument.select("img[onclick]").first().attr("src");
  428. // 1.3 get verifyImg
  429. Map<String, String> headerParams = new HashMap<>(8);
  430. headerParams.put("referer", javbusConstantMap.get("forum_url"));
  431. Connection.Response imgResponse = JsoupUtil.requestBody(imgVerifyUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, headerParams, null);
  432. byte[] imgBytes = imgResponse.bodyAsBytes();
  433. Map<String, String> imgCookies = imgResponse.cookies();
  434. log.warn("generateJavbusCookies=>,imgVerifyUrl={},imgCookies={}", imgVerifyUrl, imgCookies);
  435. String cookieKey4Seccode = "";
  436. for (Map.Entry<String, String> imgCookie : imgCookies.entrySet()) {
  437. if (imgCookie.getKey().contains("seccode")) {
  438. cookieKey4Seccode = imgCookie.getKey();
  439. break;
  440. }
  441. }
  442. // 1.4 get imgVerifyNumber by BaiduOCR
  443. headerParams.clear();
  444. headerParams.put("Content-Type", "application/x-www-form-urlencoded");
  445. params.clear();
  446. params.put("image", Base64.getEncoder().encodeToString(imgBytes));
  447. JSONObject crAccurateBasicResult = null;
  448. String seccodeverify = "";
  449. for (int i = 0; i < 3; i++) {
  450. try {
  451. Connection.Response ocrResponse = JsoupUtil.requestBody(javbusConstantMap.get("bd_ocr_url").concat("?access_token=").concat(bdAccessToken),
  452. JsoupUtil.HTTP_POST, Proxy.NO_PROXY, headerParams, params);
  453. crAccurateBasicResult = JSONObject.parseObject(ocrResponse.body());
  454. seccodeverify = crAccurateBasicResult.getJSONArray("words_result").getJSONObject(0).getString("words");
  455. break;
  456. } catch (HttpStatusException hse) {
  457. bdAccessToken = getAuth(javbusConstantMap.get("bd_ak"), javbusConstantMap.get("bd_sk"));
  458. } catch (Exception e) {
  459. log.error("BaiduOCR异常,bdOcrUrl={},bdAccessToken={},crAccurateBasicResult={}", javbusConstantMap.get("bd_ocr_url"), bdAccessToken, crAccurateBasicResult, e);
  460. if (i == 2) {
  461. throw new Exception("BaiduOCR异常!");
  462. }
  463. }
  464. }
  465. // 1.5 https://www.javbus.com/forum/member.php
  466. String sbParams = "?mod=logging&action=login&loginsubmit=yes&handlekey=login&loginhash=" + key1 + "&inajax=1";
  467. headerParams.clear();
  468. headerParams.put("Content-Type", "application/x-www-form-urlencoded");
  469. params.clear();
  470. params.put("formhash", key3);
  471. params.put("referer", javbusConstantMap.get("forum_url"));
  472. params.put("loginfield", "username");
  473. params.put("username", javbusConstantMap.get("username"));
  474. params.put("password", javbusConstantMap.get("password"));
  475. params.put("questionid", "0");
  476. params.put("answer", "");
  477. params.put("seccodehash", key2);
  478. params.put("seccodemodid", "member::logging");
  479. params.put("seccodeverify", seccodeverify);
  480. if (cookieKey4Seccode != "") {
  481. forumCookies.put("existmag", "mag");
  482. forumCookies.put(cookieKey4Seccode, imgCookies.get(cookieKey4Seccode));
  483. }
  484. String loginUrl = javbusConstantMap.get("member_url").concat(sbParams);
  485. Connection.Response loginResponse = JsoupUtil.requestBody(loginUrl, JsoupUtil.HTTP_POST, proxy, forumCookies, headerParams, params);
  486. String loginBody = loginResponse.body();
  487. Map<String, String> loginCookies = loginResponse.cookies();
  488. log.warn("generateJavbusCookies=>,loginUrl={},params={},forumCookies={},loginCookies={},loginResponseBody={}", loginUrl, params, forumCookies, loginCookies, loginResponse.body());
  489. for (Map.Entry<String, String> loginCookie : loginCookies.entrySet()) {
  490. if (loginCookie.getKey().contains("ulastactivity")) {
  491. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  492. } else if (loginCookie.getKey().contains("auth")) {
  493. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  494. } else if (loginCookie.getKey().contains("lastcheckfeed")) {
  495. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  496. } else if (loginCookie.getKey().contains("lip")) {
  497. forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
  498. }
  499. }
  500. log.warn("generateJavbusCookies=>,forumFinalCookies={}", forumCookies);
  501. javbusCookiesMap = forumCookies;
  502. return loginBody.contains("歡迎您回來");
  503. }
  504. public String getAuth(String ak, String sk) {
  505. // 获取token地址
  506. String authHost = javbusConstantMap.get("bd_authhost_url");
  507. String getAccessTokenUrl = authHost
  508. // 1. grant_type为固定参数
  509. + "grant_type=client_credentials"
  510. // 2. 官网获取的 API Key
  511. + "&client_id=" + ak
  512. // 3. 官网获取的 Secret Key
  513. + "&client_secret=" + sk;
  514. try {
  515. URL realUrl = new URL(getAccessTokenUrl);
  516. // 打开和URL之间的连接
  517. HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection();
  518. connection.setRequestMethod("GET");
  519. connection.connect();
  520. // 获取所有响应头字段
  521. Map<String, List<String>> map = connection.getHeaderFields();
  522. // 遍历所有的响应头字段
  523. /*for (String key : map.keySet()) {
  524. System.err.println(key + "--->" + map.get(key));
  525. }*/
  526. // 定义 BufferedReader输入流来读取URL的响应
  527. BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
  528. String result = "";
  529. String line;
  530. while ((line = in.readLine()) != null) {
  531. result += line;
  532. }
  533. /**
  534. * 返回结果示例
  535. */
  536. // System.err.println("result:" + result);
  537. JSONObject jsonObject = JSONObject.parseObject(result);
  538. String access_token = jsonObject.getString("access_token");
  539. return access_token;
  540. } catch (Exception e) {
  541. System.err.print("获取token失败!");
  542. e.printStackTrace(System.err);
  543. }
  544. return null;
  545. }
  546. }