// Crawler4LoveFootServiceImpl.java
package top.lvzhiqiang.service.impl;

import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StopWatch;
import top.lvzhiqiang.entity.CrawlerLoveFoot;
import top.lvzhiqiang.entity.DicCode;
import top.lvzhiqiang.exception.BusinessException;
import top.lvzhiqiang.mapper.CrawlerLoveFootMapper;
import top.lvzhiqiang.mapper.DicCodeMapper;
import top.lvzhiqiang.mapper.VideoSitePoolMapper;
import top.lvzhiqiang.service.Crawler4LoveFootService;
import top.lvzhiqiang.util.DateUtils;
import top.lvzhiqiang.util.JsoupUtil;
import top.lvzhiqiang.util.StringUtils;

import javax.annotation.Resource;
import java.io.*;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
  36. /**
  37. * Crawler LoveFoot ServiceImpl
  38. *
  39. * @author lvzhiqiang
  40. * 2022/10/17 14:47
  41. */
  42. @Service
  43. @Slf4j
  44. public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
  45. @Resource
  46. private DicCodeMapper dicCodeMapper;
  47. @Resource
  48. private CrawlerLoveFootMapper crawlerLoveFootMapper;
  49. @Resource
  50. private VideoSitePoolMapper videoSitePoolMapper;
  51. @Value("${spring.profiles.active}")
  52. private String env;
  53. Map<String, String> footConstantMap = null;
  54. Map<String, String> javbusConstantMap = null;
  55. List<String> javbusUrlList = null;
  56. Map<String, String> headerMap = new HashMap<>();
  57. Map<String, String> header2Map = new HashMap<>();
  58. Proxy proxy = null;
  59. public void beforeProxy() {
  60. if (null == proxy) {
  61. if ("dev".equals(env)) {
  62. proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
  63. } else {
  64. proxy = Proxy.NO_PROXY;
  65. }
  66. }
  67. }
  68. @Async
  69. @Override
  70. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  71. public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
  72. log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
  73. StopWatch stopWatch = new StopWatch();
  74. stopWatch.start();
  75. if (isDel == 1) {
  76. crawlerLoveFootMapper.deleteAll();
  77. }
  78. List<DicCode> dicCodeList = dicCodeMapper.findAll();
  79. // 获取常量MAP
  80. footConstantMap = dicCodeList.stream()
  81. .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  82. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  83. javbusConstantMap = dicCodeList.stream()
  84. .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
  85. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  86. // 获取javbus防屏蔽地址
  87. javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
  88. if (javbusUrlList.size() == 0) {
  89. log.warn("javbusUrlList为空");
  90. return;
  91. }
  92. // 代理及TOKEN设置
  93. beforeProxy();
  94. // 解析原始站点
  95. jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount);
  96. log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds());
  97. }
  98. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  99. public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
  100. CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo();
  101. LocalDate latestDate;
  102. if (latestLoveFoot == null) {
  103. latestDate = LocalDate.of(1970, 1, 1);
  104. } else {
  105. latestDate = latestLoveFoot.getUpdateDate();
  106. }
  107. String avnoashiUrl = footConstantMap.get("avnoashi_url");
  108. headerMap.put("referer", avnoashiUrl);
  109. header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
  110. Document loveFootDocument;
  111. Document loveFootDetailDocument;
  112. outer:
  113. while (true) {
  114. loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
  115. log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
  116. Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
  117. for (Element sourceSelect : sourceSelects) {
  118. String sourceUrl = sourceSelect.select("a").attr("abs:href");
  119. Integer statusInt = 2;
  120. Integer typeInt = 1;
  121. LocalDate clockDate = null;
  122. LocalDate updateDate = null;
  123. String keywords = null;
  124. try {
  125. loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
  126. String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
  127. String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
  128. clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
  129. updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
  130. if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
  131. break outer;
  132. }
  133. // 获取关键词
  134. keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
  135. if (StringUtils.isNotEmpty(keywords)) {
  136. statusInt = 1;
  137. log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
  138. } else {
  139. throw new Exception("keywords is null");
  140. }
  141. // 通过关键词获取识别码
  142. CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
  143. crawlerLoveFoot.setClockDate(clockDate);
  144. crawlerLoveFoot.setUpdateDate(updateDate);
  145. crawlerLoveFoot.setOrginUrl(sourceUrl);
  146. crawlerLoveFoot.setType(2);
  147. crawlerLoveFoot.setStatus(3);
  148. String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
  149. if (StringUtils.isNotEmpty(message)) {
  150. statusInt = 4;
  151. throw new Exception(message);
  152. }
  153. crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
  154. } catch (Exception e) {
  155. log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
  156. CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
  157. crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
  158. crawlerLoveFoot.setOrginUrl(sourceUrl);
  159. crawlerLoveFoot.setClockDate(clockDate);
  160. crawlerLoveFoot.setUpdateDate(updateDate);
  161. crawlerLoveFoot.setName(keywords);
  162. crawlerLoveFoot.setType(typeInt);
  163. crawlerLoveFoot.setStatus(statusInt);
  164. crawlerLoveFoot.setCreateTime(LocalDateTime.now());
  165. crawlerLoveFoot.setFailureCause(e.getMessage());
  166. crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
  167. }
  168. }
  169. // 继续下一页
  170. Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)");
  171. if (nextSelects.size() > 0) {
  172. avnoashiUrl = nextSelects.get(0).attr("abs:href");
  173. } else {
  174. break;
  175. }
  176. }
  177. }
  178. private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
  179. int retryCount = 0;
  180. Document javbusSearchDocument;
  181. Document javbusCodeDocument;
  182. String message = null;
  183. while (retryCount <= 3) {
  184. long start = System.currentTimeMillis();
  185. String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
  186. String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
  187. try {
  188. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  189. Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
  190. if (itembSelects.size() == 0) {
  191. throw new BusinessException(30000, "search result null");
  192. }
  193. // 获取codeUrl
  194. String codeUrl = itembSelects.select("a.movie-box").get(0).attr("abs:href");
  195. // 解析codeUrl
  196. javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  197. long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
  198. crawlerLoveFoot.setRetryCount(retryCount);
  199. log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
  200. break;
  201. } catch (Exception e) {
  202. ++retryCount;
  203. if (retryCount < 4) {
  204. log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e);
  205. } else if (retryCount == 4) {
  206. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  207. }
  208. if (e instanceof BusinessException) {
  209. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  210. break;
  211. }
  212. }
  213. }
  214. return message;
  215. }
  216. private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception {
  217. Elements container = document.select("div.container");
  218. if (container.size() == 0) {
  219. throw new BusinessException(30000, "番号无效!");
  220. }
  221. // 名称
  222. String h3 = container.select("h3").first().text();
  223. String[] nameArr = h3.split("\\s+");
  224. if (nameArr.length > 1) {
  225. crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim());
  226. } else {
  227. crawlerLoveFoot.setName(nameArr[0]);
  228. }
  229. Elements pEles = container.select("div.info > p");
  230. // 识别码
  231. Element pEle = pEles.get(0);
  232. String iCode = pEle.select("span[style]").first().text();
  233. crawlerLoveFoot.setIdentificationCode(iCode);
  234. // 发行日期
  235. pEle = pEles.get(1);
  236. String issueDate = pEle.text().split(":")[1].replace("\"", "").trim();
  237. crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
  238. // 长度
  239. pEle = pEles.get(2);
  240. String length = pEle.text().split(":")[1].replace("\"", "").trim();
  241. crawlerLoveFoot.setLength(length);
  242. // 导演
  243. Elements directorEles = container.select("div.info").select("p:contains(導演)");
  244. if (directorEles.size() > 0) {
  245. pEle = directorEles.first().select("a[href]").first();
  246. crawlerLoveFoot.setDirector(pEle.text());
  247. }
  248. // 制作商
  249. Elements markerEles = container.select("div.info").select("p:contains(製作商)");
  250. if (markerEles.size() > 0) {
  251. pEle = markerEles.first().select("a[href]").first();
  252. crawlerLoveFoot.setMaker(pEle.text());
  253. }
  254. // 发行商
  255. Elements issuerEles = container.select("div.info").select("p:contains(發行商)");
  256. if (issuerEles.size() > 0) {
  257. pEle = issuerEles.first().select("a[href]").first();
  258. crawlerLoveFoot.setIssuer(pEle.text());
  259. }
  260. // 类别
  261. Elements genresEles = container.select("div.info").select("p:contains(類別)");
  262. if (genresEles.size() > 0) {
  263. StringBuffer sb = new StringBuffer();
  264. Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]");
  265. for (Element ahrefEle : ahrefEles) {
  266. sb.append(ahrefEle.text()).append(",");
  267. }
  268. if (sb.length() > 0) {
  269. sb = sb.deleteCharAt(sb.length() - 1);
  270. }
  271. crawlerLoveFoot.setGenres(sb.toString());
  272. }
  273. // 演员
  274. Elements castEles = container.select("div.info").select("p.star-show:contains(演員)");
  275. if (castEles.size() > 0) {
  276. Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)");
  277. if (castElesTemp.size() == 0) {
  278. StringBuffer sb = new StringBuffer();
  279. Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]");
  280. for (Element ahrefEle : ahrefEles) {
  281. sb.append(ahrefEle.text()).append(",");
  282. }
  283. if (sb.length() > 0) {
  284. sb = sb.deleteCharAt(sb.length() - 1);
  285. }
  286. crawlerLoveFoot.setCast(sb.toString());
  287. }
  288. }
  289. // 图片URL
  290. String href = container.select("a.bigImage").first().attr("abs:href");
  291. long start = System.currentTimeMillis();
  292. Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
  293. String fileName = issueDate.concat(" ").concat(h3);
  294. byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
  295. if (imgUrlBytes.length > 251) {
  296. byte[] imgUrlDestBytes = new byte[251];
  297. System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
  298. fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
  299. }
  300. fileName = fileName.concat(".jpg");
  301. String machiImgUrl = "足舐/".concat(fileName);
  302. saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
  303. long end = System.currentTimeMillis();
  304. crawlerLoveFoot.setImgUrl(machiImgUrl);
  305. crawlerLoveFoot.setCreateTime(LocalDateTime.now());
  306. return end - start;
  307. }
  308. /**
  309. * 保存文件到本地
  310. *
  311. * @param bufferedInputStream
  312. * @param savePath
  313. */
  314. private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException {
  315. //一次最多读取1k
  316. byte[] buffer = new byte[1024];
  317. //实际读取的长度
  318. int readLenghth;
  319. //创建的一个写出的缓冲流
  320. BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath)));
  321. //文件逐步写入本地
  322. while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
  323. bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
  324. }
  325. //关闭缓冲流
  326. bufferedOutputStream.close();
  327. bufferedInputStream.close();
  328. }
  329. }