Crawler4LoveFootServiceImpl.java 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909
  1. package top.lvzhiqiang.service.impl;
  2. import lombok.extern.slf4j.Slf4j;
  3. import org.jsoup.Connection;
  4. import org.jsoup.Jsoup;
  5. import org.jsoup.nodes.Document;
  6. import org.jsoup.nodes.Element;
  7. import org.jsoup.select.Elements;
  8. import org.springframework.beans.factory.annotation.Value;
  9. import org.springframework.scheduling.annotation.Async;
  10. import org.springframework.stereotype.Service;
  11. import org.springframework.transaction.annotation.Propagation;
  12. import org.springframework.transaction.annotation.Transactional;
  13. import org.springframework.util.StopWatch;
  14. import top.lvzhiqiang.entity.CrawlerLoveFoot;
  15. import top.lvzhiqiang.entity.DicCode;
  16. import top.lvzhiqiang.exception.BusinessException;
  17. import top.lvzhiqiang.mapper.CrawlerLoveFootMapper;
  18. import top.lvzhiqiang.mapper.DicCodeMapper;
  19. import top.lvzhiqiang.mapper.VideoSitePoolMapper;
  20. import top.lvzhiqiang.service.Crawler4LoveFootService;
  21. import top.lvzhiqiang.util.DateUtils;
  22. import top.lvzhiqiang.util.JsoupUtil;
  23. import top.lvzhiqiang.util.StringUtils;
  24. import javax.annotation.Resource;
  25. import java.io.*;
  26. import java.net.InetSocketAddress;
  27. import java.net.Proxy;
  28. import java.nio.charset.StandardCharsets;
  29. import java.time.LocalDate;
  30. import java.time.LocalDateTime;
  31. import java.util.HashMap;
  32. import java.util.List;
  33. import java.util.Map;
  34. import java.util.UUID;
  35. import java.util.stream.Collectors;
  36. /**
  37. * Crawler LoveFoot ServiceImpl
  38. *
  39. * @author lvzhiqiang
  40. * 2022/10/17 14:47
  41. */
  42. @Service
  43. @Slf4j
  44. public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
  // Mapper whose findAll() supplies the DicCode rows the constant maps below are built from.
  45. @Resource
  46. private DicCodeMapper dicCodeMapper;
  // Mapper used to read, insert, update and delete CrawlerLoveFoot rows.
  47. @Resource
  48. private CrawlerLoveFootMapper crawlerLoveFootMapper;
  // Mapper whose findUrlByTypeAndDeleteFlag(1, 1) result is stored in javbusUrlList.
  49. @Resource
  50. private VideoSitePoolMapper videoSitePoolMapper;
  // Active Spring profile; used to filter DicCode rows and to pick the proxy in beforeProxy().
  51. @Value("${spring.profiles.active}")
  52. private String env;
  // codeKey -> codeValue for DicCode rows whose codeDesc is "foot"; reassigned by the crawl entry points.
  53. Map<String, String> footConstantMap = null;
  // codeKey -> codeValue for DicCode rows of type 1; reassigned by the crawl entry points.
  54. Map<String, String> javbusConstantMap = null;
  // codeKey -> codeValue for DicCode rows of type 2; reassigned by jsoupLoveFoot4CrawingFail.
  55. Map<String, String> javdbConstantMap = null;
  // Base URLs a javbus search picks from at random.
  56. List<String> javbusUrlList = null;
  // Request headers; the crawlers put a "referer" entry (listing URL) before requesting pages.
  57. Map<String, String> headerMap = new HashMap<>();
  // Request headers for avnoashi detail pages ("referer" = listing URL + "?sort=newer").
  58. Map<String, String> header2Map = new HashMap<>();
  // Request headers for javdb requests ("referer" = the current javdb search URL).
  59. Map<String, String> header3Map = new HashMap<>();
  // Proxy passed to JsoupUtil.requestDocument; initialised lazily by beforeProxy().
  // NOTE(review): all of the fields above are mutable instance state written from @Async
  // methods - presumably only one crawl runs at a time; confirm.
  60. Proxy proxy = null;
  61. public void beforeProxy() {
  62. if (null == proxy) {
  63. if ("dev".equals(env)) {
  64. proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
  65. } else {
  66. proxy = Proxy.NO_PROXY;
  67. }
  68. }
  69. }
  70. @Async
  71. @Override
  72. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  73. public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
  74. log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
  75. StopWatch stopWatch = new StopWatch();
  76. stopWatch.start();
  77. if (isDel == 1) {
  78. crawlerLoveFootMapper.deleteAll();
  79. }
  80. List<DicCode> dicCodeList = dicCodeMapper.findAll();
  81. // 获取常量MAP
  82. footConstantMap = dicCodeList.stream()
  83. .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  84. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  85. javbusConstantMap = dicCodeList.stream()
  86. .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
  87. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  88. // 获取javbus防屏蔽地址
  89. javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
  90. if (javbusUrlList.size() == 0) {
  91. log.warn("javbusUrlList为空");
  92. return;
  93. }
  94. // 代理及TOKEN设置
  95. beforeProxy();
  96. // 解析原始站点
  97. jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount);
  98. log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds());
  99. }
  100. @Async
  101. @Override
  102. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  103. public void jsoupLoveFoot4jpfoot(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
  104. log.warn("jsoupLoveFoot4jpfoot 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
  105. StopWatch stopWatch = new StopWatch();
  106. stopWatch.start();
  107. if (isDel == 1) {
  108. crawlerLoveFootMapper.deleteAll();
  109. }
  110. List<DicCode> dicCodeList = dicCodeMapper.findAll();
  111. // 获取常量MAP
  112. footConstantMap = dicCodeList.stream()
  113. .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  114. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  115. javbusConstantMap = dicCodeList.stream()
  116. .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
  117. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  118. // 获取javbus防屏蔽地址
  119. javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
  120. if (javbusUrlList.size() == 0) {
  121. log.warn("javbusUrlList为空");
  122. return;
  123. }
  124. // 代理及TOKEN设置
  125. beforeProxy();
  126. // 解析原始站点
  127. jsoupLoveFoot4jpfootSub(status, ignoreRetryCount);
  128. log.warn("jsoupLoveFoot4jpfoot 结束:time={}", stopWatch.getTotalTimeSeconds());
  129. }
  130. @Async
  131. @Override
  132. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  133. public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website) {
  134. log.warn("jjsoupLoveFoot4CrawingFail 开始");
  135. StopWatch stopWatch = new StopWatch();
  136. stopWatch.start();
  137. // 获取待抓取码列表
  138. List<CrawlerLoveFoot> loveFootList;
  139. if (1 == ignoreRetryCount) {
  140. loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status);
  141. } else {
  142. loveFootList = crawlerLoveFootMapper.findInfoByStatus(status);
  143. }
  144. if (loveFootList.size() == 0) {
  145. log.warn("loveFootList为空");
  146. return;
  147. }
  148. log.warn("jsoupLoveFoot4CrawingFail loveFootList size={}", loveFootList.size());
  149. List<DicCode> dicCodeList = dicCodeMapper.findAll();
  150. // 获取常量MAP
  151. javbusConstantMap = dicCodeList.stream()
  152. .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
  153. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  154. javdbConstantMap = dicCodeList.stream()
  155. .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
  156. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  157. // 获取javbus防屏蔽地址
  158. if ("javbus".equals(website)) {
  159. javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
  160. if (javbusUrlList.size() == 0) {
  161. log.warn("javbusUrlList为空");
  162. return;
  163. }
  164. }
  165. // 代理及TOKEN设置
  166. beforeProxy();
  167. // 解析原始站点
  168. int successCount = jsoupLoveFoot4CrawingFailSub(loveFootList, website);
  169. log.warn("jjsoupLoveFoot4CrawingFail 结束:totalCount={},successCount={},time={}", loveFootList.size(), successCount, stopWatch.getTotalTimeSeconds());
  170. }
  171. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  172. public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
  173. int successCount = 0;
  174. for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
  175. Document searchDocument = null;
  176. Document codeDocument;
  177. String message = null;
  178. int retryCount = 0;
  179. while (retryCount <= 3) {
  180. long start = System.currentTimeMillis();
  181. String searchUrl = null;
  182. Elements itembSelects = null;
  183. try {
  184. String javbusCodeUrl = null;
  185. if ("javbus".equals(website)) {
  186. String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
  187. searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
  188. try {
  189. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  190. } catch (Exception ee) {
  191. String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
  192. searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  193. try {
  194. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  195. } catch (Exception eee) {
  196. newName = newName.substring(newName.length() / 2);
  197. searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  198. try {
  199. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  200. } catch (Exception eeee) {
  201. // throw new BusinessException(30000, "javbus search result null");
  202. }
  203. }
  204. }
  205. if (null == searchDocument) {
  206. String newName = crawlerLoveFoot.getName().replace("●", "");
  207. searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  208. try {
  209. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  210. } catch (Exception ee) {
  211. newName = newName.substring(0, newName.length() / 2);
  212. searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  213. try {
  214. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  215. } catch (Exception eee) {
  216. newName = newName.substring(0, newName.length() / 2);
  217. searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  218. try {
  219. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  220. } catch (Exception eeee) {
  221. // throw new BusinessException(30000, "javbus search result null");
  222. }
  223. }
  224. }
  225. }
  226. if (null == searchDocument) {
  227. if (crawlerLoveFoot.getIdentificationCode().length() < 32) {
  228. javbusCodeUrl = javbusUrl.concat("/").concat(crawlerLoveFoot.getIdentificationCode());
  229. itembSelects = new Elements();
  230. } else {
  231. throw new BusinessException(30000, "javbus search result null");
  232. }
  233. } else {
  234. itembSelects = searchDocument.select("div#waterfall").select("div.item");
  235. if (itembSelects.size() == 0) {
  236. throw new BusinessException(30000, "javbus search result null");
  237. }
  238. if (crawlerLoveFoot.getIdentificationCode().length() < 32) {
  239. javbusCodeUrl = javbusUrl.concat("/").concat(crawlerLoveFoot.getIdentificationCode());
  240. }
  241. }
  242. } else if ("javdb".equals(website)) {
  243. searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
  244. header3Map.put("referer", searchUrl);
  245. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
  246. itembSelects = searchDocument.select("div.movie-list").select("div.item");
  247. if (itembSelects.size() == 0) {
  248. String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
  249. searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
  250. searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
  251. itembSelects = searchDocument.select("div.movie-list").select("div.item");
  252. }
  253. if (itembSelects.size() == 0) {
  254. throw new BusinessException(30000, "javdb search result null");
  255. }
  256. }
  257. // 获取codeUrl
  258. String codeUrl = null;
  259. String title;
  260. if ("javbus".equals(website)) {
  261. for (Element itembSelect : itembSelects) {
  262. title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
  263. if (title.contains(crawlerLoveFoot.getName())) {
  264. codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
  265. break;
  266. }
  267. String newName = crawlerLoveFoot.getName().replace("●", "さ");
  268. if (title.contains(newName)) {
  269. codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
  270. crawlerLoveFoot.setName(newName);
  271. break;
  272. }
  273. String[] newNameArr = crawlerLoveFoot.getName().split("●");
  274. int matchCount = 0;
  275. for (String s : newNameArr) {
  276. if (title.contains(s)) {
  277. matchCount++;
  278. }
  279. }
  280. if (newNameArr.length == matchCount) {
  281. codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
  282. crawlerLoveFoot.setName(title);
  283. break;
  284. }
  285. }
  286. if (StringUtils.isEmpty(codeUrl) && StringUtils.isNotEmpty(javbusCodeUrl)) {
  287. codeUrl = javbusCodeUrl;
  288. crawlerLoveFoot.setChangeTitleFlag(1);
  289. }
  290. if (StringUtils.isEmpty(codeUrl)) {
  291. throw new BusinessException(30000, "javbus search result mismatch");
  292. }
  293. } else if ("javdb".equals(website)) {
  294. for (Element itembSelect : itembSelects) {
  295. title = itembSelect.select("a.box").get(0).attr("title");
  296. if (title.contains(crawlerLoveFoot.getName())) {
  297. codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
  298. break;
  299. }
  300. String newName = crawlerLoveFoot.getName().replace("●", "さ");
  301. if (title.contains(newName)) {
  302. codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
  303. crawlerLoveFoot.setName(newName);
  304. break;
  305. }
  306. }
  307. if (StringUtils.isEmpty(codeUrl)) {
  308. throw new BusinessException(30000, "javdb search result mismatch");
  309. }
  310. }
  311. // 解析codeUrl
  312. long picTime = 999;
  313. if ("javbus".equals(website)) {
  314. codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  315. picTime = parseJavbusCodeDocument(codeDocument, crawlerLoveFoot);
  316. } else if ("javdb".equals(website)) {
  317. codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
  318. picTime = parseJavdbCodeDocument(codeDocument, crawlerLoveFoot);
  319. crawlerLoveFoot.setJavdbUrl(codeUrl);
  320. }
  321. crawlerLoveFoot.setRetryCount(retryCount);
  322. crawlerLoveFoot.setType(2);
  323. crawlerLoveFoot.setStatus(3);
  324. log.warn("jsoupLoveFoot4CrawingFailSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", crawlerLoveFoot.getName(), crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
  325. break;
  326. } catch (Exception e) {
  327. ++retryCount;
  328. if (retryCount < 4) {
  329. log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, searchUrl, e);
  330. } else if (retryCount == 4) {
  331. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  332. }
  333. if (e instanceof BusinessException) {
  334. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  335. break;
  336. }
  337. }
  338. }
  339. if (StringUtils.isNotEmpty(message)) {
  340. CrawlerLoveFoot crawlerLoveFoot2 = new CrawlerLoveFoot();
  341. crawlerLoveFoot2.setId(crawlerLoveFoot.getId());
  342. crawlerLoveFoot2.setFailureCause(message);
  343. crawlerLoveFoot2.setRetryCount(retryCount);
  344. crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot2);
  345. } else {
  346. crawlerLoveFoot.setFailureCause("");
  347. crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot);
  348. successCount++;
  349. }
  350. }
  351. return successCount;
  352. }
  353. private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException {
  354. Elements container = javdbCodeDocument.select("section.section > div.container");
  355. if (container.size() == 0) {
  356. throw new BusinessException(30000, "番号无效!");
  357. }
  358. Elements videoDetail = container.select("div.video-detail");
  359. // 名称
  360. // crawlerLoveFoot.setName(videoDetail.select("h2.title").select("strong.current-title").text().trim());
  361. Elements moviePanelInfos = videoDetail.select("nav.movie-panel-info");
  362. Element pEle = moviePanelInfos.get(0);
  363. // 识别码
  364. String iCode = pEle.select("div:contains(番號)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
  365. crawlerLoveFoot.setIdentificationCode(iCode);
  366. // 发行日期
  367. String issueDate = pEle.select("div:contains(日期)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
  368. crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
  369. // 长度
  370. String length = pEle.select("div:contains(時長)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
  371. crawlerLoveFoot.setLength(length);
  372. // 导演
  373. Elements directorEles = pEle.select("div:contains(導演)").select("span.value");
  374. if (directorEles.size() > 0) {
  375. crawlerLoveFoot.setDirector(directorEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
  376. }
  377. // 制作商/片商
  378. Elements markerEles = pEle.select("div:contains(片商)").select("span.value");
  379. if (markerEles.size() > 0) {
  380. crawlerLoveFoot.setMaker(markerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
  381. }
  382. // 发行商
  383. Elements issuerEles = pEle.select("div:contains(發行)").select("span.value");
  384. if (issuerEles.size() > 0) {
  385. crawlerLoveFoot.setIssuer(issuerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
  386. }
  387. // 类别
  388. Elements genresEles = pEle.select("div:contains(類別)").select("span.value");
  389. if (genresEles.size() > 0) {
  390. StringBuffer sb = new StringBuffer();
  391. Elements ahrefEles = genresEles.first().select("a[href]");
  392. for (Element ahrefEle : ahrefEles) {
  393. sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
  394. }
  395. if (sb.length() > 0) {
  396. sb = sb.deleteCharAt(sb.length() - 1);
  397. }
  398. crawlerLoveFoot.setGenres(sb.toString());
  399. }
  400. // 演员
  401. Elements castEles = pEle.select("div:contains(演員)").select("span.value");
  402. if (castEles.size() > 0) {
  403. StringBuffer sb = new StringBuffer();
  404. Elements ahrefEles = castEles.first().select("a[href]");
  405. for (Element ahrefEle : ahrefEles) {
  406. sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
  407. }
  408. if (sb.length() > 0) {
  409. sb = sb.deleteCharAt(sb.length() - 1);
  410. }
  411. crawlerLoveFoot.setCast(sb.toString());
  412. }
  413. // 图片URL
  414. Elements videoMetaPanel = videoDetail.select("div.column-video-cover");
  415. String href = videoMetaPanel.select("a > img").first().attr("src");
  416. long start = System.currentTimeMillis();
  417. Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
  418. String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(crawlerLoveFoot.getName());
  419. byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
  420. if (imgUrlBytes.length > 251) {
  421. byte[] imgUrlDestBytes = new byte[251];
  422. System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
  423. fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
  424. }
  425. fileName = fileName.concat(".jpg");
  426. String machiImgUrl = "足舐/".concat(fileName);
  427. saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
  428. long end = System.currentTimeMillis();
  429. crawlerLoveFoot.setImgUrl(machiImgUrl);
  430. return end - start;
  431. }
  432. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  433. public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
  434. CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4avnoashi();
  435. LocalDate latestDate;
  436. if (latestLoveFoot == null) {
  437. latestDate = LocalDate.of(1970, 1, 1);
  438. } else {
  439. latestDate = latestLoveFoot.getUpdateDate();
  440. }
  441. String avnoashiUrl = footConstantMap.get("avnoashi_url");
  442. headerMap.put("referer", avnoashiUrl);
  443. header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
  444. Document loveFootDocument;
  445. Document loveFootDetailDocument;
  446. outer:
  447. while (true) {
  448. loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
  449. log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
  450. Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
  451. for (Element sourceSelect : sourceSelects) {
  452. String sourceUrl = sourceSelect.select("a").attr("abs:href");
  453. Integer statusInt = 2;
  454. Integer typeInt = 1;
  455. LocalDate clockDate = null;
  456. LocalDate updateDate = null;
  457. String keywords = null;
  458. try {
  459. loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
  460. String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
  461. String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
  462. clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
  463. updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
  464. if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
  465. break outer;
  466. }
  467. // 获取关键词
  468. keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
  469. if (StringUtils.isNotEmpty(keywords)) {
  470. statusInt = 1;
  471. log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
  472. } else {
  473. throw new Exception("keywords is null");
  474. }
  475. // 通过关键词获取识别码
  476. CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
  477. crawlerLoveFoot.setClockDate(clockDate);
  478. crawlerLoveFoot.setUpdateDate(updateDate);
  479. crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
  480. crawlerLoveFoot.setType(2);
  481. crawlerLoveFoot.setStatus(3);
  482. crawlerLoveFoot.setCreateTime(LocalDateTime.now());
  483. String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
  484. if (StringUtils.isNotEmpty(message)) {
  485. statusInt = 4;
  486. throw new Exception(message);
  487. }
  488. crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
  489. } catch (Exception e) {
  490. log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
  491. CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
  492. crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
  493. crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
  494. crawlerLoveFoot.setClockDate(clockDate);
  495. crawlerLoveFoot.setUpdateDate(updateDate);
  496. crawlerLoveFoot.setName(keywords);
  497. crawlerLoveFoot.setType(typeInt);
  498. crawlerLoveFoot.setStatus(statusInt);
  499. crawlerLoveFoot.setCreateTime(LocalDateTime.now());
  500. crawlerLoveFoot.setFailureCause(e.getMessage());
  501. crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
  502. }
  503. }
  504. // 继续下一页
  505. Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)");
  506. if (nextSelects.size() > 0) {
  507. avnoashiUrl = nextSelects.get(0).attr("abs:href");
  508. } else {
  509. break;
  510. }
  511. }
  512. }
  513. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  514. public void jsoupLoveFoot4jpfootSub(Integer status, Integer ignoreRetryCount) throws Exception {
  515. CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4jpfoot();
  516. LocalDate latestDate;
  517. if (latestLoveFoot == null) {
  518. latestDate = LocalDate.of(1970, 1, 1);
  519. } else {
  520. latestDate = latestLoveFoot.getUpdateDate();
  521. }
  522. String jpfootUrl = footConstantMap.get("jpfoot_url");
  523. headerMap.put("referer", jpfootUrl);
  524. Document loveFootDocument;
  525. Document loveFootDetailDocument;
  526. outer:
  527. while (true) {
  528. loveFootDocument = JsoupUtil.requestDocument(jpfootUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
  529. log.warn("jsoupLoveFoot4jpfootSub page success:url={}", jpfootUrl);
  530. Elements sourceSelects = loveFootDocument.select("article.mainContainer > div.av_itemGrid").select("article.av_item");
  531. for (Element sourceSelect : sourceSelects) {
  532. Thread.sleep(1000L);
  533. String sourceUrl = sourceSelect.select("a.av_itemLink").attr("abs:href");
  534. Integer statusInt = 2;
  535. Integer typeInt = 1;
  536. LocalDate clockDate = null;
  537. LocalDate updateDate = null;
  538. String keywords = null;
  539. try {
  540. loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
  541. String dateStr = loveFootDetailDocument.select("div.avdetail_date").select("span.avdetail_dateText").text();
  542. clockDate = LocalDate.parse(dateStr, DateUtils.dateFormatter4);
  543. updateDate = clockDate;
  544. if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
  545. break outer;
  546. }
  547. // 获取关键词
  548. keywords = loveFootDetailDocument.select("div.avdetail_detailTop").select("p.avdetail_detailTopTitle").text().trim();
  549. if (StringUtils.isNotEmpty(keywords)) {
  550. statusInt = 1;
  551. log.warn("jsoupLoveFoot4jpfootSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
  552. } else {
  553. throw new Exception("keywords is null");
  554. }
  555. // 通过关键词获取识别码
  556. CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
  557. crawlerLoveFoot.setClockDate(clockDate);
  558. crawlerLoveFoot.setUpdateDate(updateDate);
  559. crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
  560. crawlerLoveFoot.setType(2);
  561. crawlerLoveFoot.setStatus(3);
  562. crawlerLoveFoot.setCreateTime(LocalDateTime.now());
  563. String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
  564. if (StringUtils.isNotEmpty(message)) {
  565. statusInt = 4;
  566. throw new Exception(message);
  567. }
  568. crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
  569. } catch (Exception e) {
  570. log.error("jsoupLoveFoot4jpfootSub detail fail,sourceUrl={}", sourceUrl, e);
  571. CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
  572. crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
  573. crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
  574. crawlerLoveFoot.setClockDate(clockDate);
  575. crawlerLoveFoot.setUpdateDate(updateDate);
  576. crawlerLoveFoot.setName(keywords);
  577. crawlerLoveFoot.setType(typeInt);
  578. crawlerLoveFoot.setStatus(statusInt);
  579. crawlerLoveFoot.setCreateTime(LocalDateTime.now());
  580. crawlerLoveFoot.setFailureCause(e.getMessage());
  581. crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
  582. }
  583. }
  584. // 继续下一页
  585. Elements nextSelects = loveFootDocument.select("nav.pagination > div.nav-links").select("a.next");
  586. if (nextSelects.size() > 0) {
  587. jpfootUrl = nextSelects.get(0).attr("abs:href");
  588. } else {
  589. break;
  590. }
  591. }
  592. }
  593. private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
  594. int retryCount = 0;
  595. Document javbusSearchDocument = null;
  596. Document javbusCodeDocument;
  597. String message = null;
  598. while (retryCount <= 3) {
  599. long start = System.currentTimeMillis();
  600. try {
  601. String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
  602. String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
  603. try {
  604. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  605. } catch (Exception ee) {
  606. String newName = keywords.substring(keywords.length() / 2);
  607. javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  608. try {
  609. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  610. } catch (Exception eee) {
  611. newName = newName.substring(newName.length() / 2);
  612. javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  613. try {
  614. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  615. } catch (Exception eeee) {
  616. // throw new BusinessException(30000, "javbus search result null");
  617. }
  618. }
  619. }
  620. if (null == javbusSearchDocument) {
  621. String newName = keywords.replace("●", "");
  622. javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  623. try {
  624. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  625. } catch (Exception ee) {
  626. newName = newName.substring(0, newName.length() / 2);
  627. javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  628. try {
  629. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  630. } catch (Exception eee) {
  631. newName = newName.substring(0, newName.length() / 2);
  632. javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
  633. try {
  634. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  635. } catch (Exception eeee) {
  636. throw new BusinessException(30000, "javbus search result null");
  637. }
  638. }
  639. }
  640. }
  641. Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
  642. if (itembSelects.size() == 0) {
  643. throw new BusinessException(30000, "javbus search result null");
  644. }
  645. // 获取codeUrl
  646. String codeUrl = null;
  647. String title;
  648. for (Element itembSelect : itembSelects) {
  649. title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
  650. if (title.contains(keywords)) {
  651. codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
  652. break;
  653. }
  654. String newName = keywords.replace("●", "さ");
  655. if (title.contains(newName)) {
  656. codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
  657. crawlerLoveFoot.setName(newName);
  658. break;
  659. }
  660. String[] newNameArr = keywords.split("●");
  661. int matchCount = 0;
  662. for (String s : newNameArr) {
  663. if (title.contains(s)) {
  664. matchCount++;
  665. }
  666. }
  667. if (newNameArr.length == matchCount) {
  668. codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
  669. crawlerLoveFoot.setName(title);
  670. break;
  671. }
  672. }
  673. if (StringUtils.isEmpty(codeUrl)) {
  674. throw new BusinessException(30000, "javbus search result mismatch");
  675. }
  676. // 解析codeUrl
  677. javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  678. long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
  679. crawlerLoveFoot.setRetryCount(retryCount);
  680. log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
  681. break;
  682. } catch (Exception e) {
  683. ++retryCount;
  684. if (retryCount < 4) {
  685. log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e);
  686. } else if (retryCount == 4) {
  687. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  688. }
  689. if (e instanceof BusinessException) {
  690. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  691. break;
  692. }
  693. }
  694. }
  695. return message;
  696. }
  697. private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception {
  698. Elements container = document.select("div.container");
  699. if (container.size() == 0) {
  700. throw new BusinessException(30000, "番号无效!");
  701. }
  702. // 名称
  703. String h3 = container.select("h3").first().text();
  704. String[] nameArr = h3.split("\\s+");
  705. if (nameArr.length > 1) {
  706. crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim());
  707. } else {
  708. crawlerLoveFoot.setName(nameArr[0]);
  709. }
  710. Elements pEles = container.select("div.info > p");
  711. // 识别码
  712. Element pEle = pEles.get(0);
  713. String iCode = pEle.select("span[style]").first().text();
  714. crawlerLoveFoot.setIdentificationCode(iCode);
  715. // 发行日期
  716. pEle = pEles.get(1);
  717. String issueDate = pEle.text().split(":")[1].replace("\"", "").trim();
  718. crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
  719. // 长度
  720. pEle = pEles.get(2);
  721. String length = pEle.text().split(":")[1].replace("\"", "").trim();
  722. crawlerLoveFoot.setLength(length);
  723. // 导演
  724. Elements directorEles = container.select("div.info").select("p:contains(導演)");
  725. if (directorEles.size() > 0) {
  726. pEle = directorEles.first().select("a[href]").first();
  727. crawlerLoveFoot.setDirector(pEle.text());
  728. }
  729. // 制作商
  730. Elements markerEles = container.select("div.info").select("p:contains(製作商)");
  731. if (markerEles.size() > 0) {
  732. pEle = markerEles.first().select("a[href]").first();
  733. crawlerLoveFoot.setMaker(pEle.text());
  734. }
  735. // 发行商
  736. Elements issuerEles = container.select("div.info").select("p:contains(發行商)");
  737. if (issuerEles.size() > 0) {
  738. pEle = issuerEles.first().select("a[href]").first();
  739. crawlerLoveFoot.setIssuer(pEle.text());
  740. }
  741. // 类别
  742. Elements genresEles = container.select("div.info").select("p:contains(類別)");
  743. if (genresEles.size() > 0) {
  744. StringBuffer sb = new StringBuffer();
  745. Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]");
  746. for (Element ahrefEle : ahrefEles) {
  747. sb.append(ahrefEle.text()).append(",");
  748. }
  749. if (sb.length() > 0) {
  750. sb = sb.deleteCharAt(sb.length() - 1);
  751. }
  752. crawlerLoveFoot.setGenres(sb.toString());
  753. }
  754. // 演员
  755. Elements castEles = container.select("div.info").select("p.star-show:contains(演員)");
  756. if (castEles.size() > 0) {
  757. Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)");
  758. if (castElesTemp.size() == 0) {
  759. StringBuffer sb = new StringBuffer();
  760. Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]");
  761. for (Element ahrefEle : ahrefEles) {
  762. sb.append(ahrefEle.text()).append(",");
  763. }
  764. if (sb.length() > 0) {
  765. sb = sb.deleteCharAt(sb.length() - 1);
  766. }
  767. crawlerLoveFoot.setCast(sb.toString());
  768. }
  769. }
  770. // 图片URL
  771. String href = container.select("a.bigImage").first().attr("abs:href");
  772. long start = System.currentTimeMillis();
  773. Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
  774. String fileName = issueDate.concat(" ").concat(h3).replace("/", "_");
  775. byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
  776. if (imgUrlBytes.length > 251) {
  777. byte[] imgUrlDestBytes = new byte[251];
  778. System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
  779. fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
  780. }
  781. fileName = fileName.concat(".jpg");
  782. String machiImgUrl = "足舐/".concat(fileName);
  783. saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
  784. long end = System.currentTimeMillis();
  785. crawlerLoveFoot.setImgUrl(machiImgUrl);
  786. return end - start;
  787. }
  788. /**
  789. * 保存文件到本地
  790. *
  791. * @param bufferedInputStream
  792. * @param savePath
  793. */
  794. private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException {
  795. //一次最多读取1k
  796. byte[] buffer = new byte[1024];
  797. //实际读取的长度
  798. int readLenghth;
  799. //创建的一个写出的缓冲流
  800. BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath)));
  801. //文件逐步写入本地
  802. while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
  803. bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
  804. }
  805. //关闭缓冲流
  806. bufferedOutputStream.close();
  807. bufferedInputStream.close();
  808. }
  809. public static void main(String[] args) {
  810. String s = "リア充反対!彼女の目の前で彼氏を拘束、●す鬼畜痴女";
  811. String newName = s.substring(s.length() / 2);
  812. newName = newName.substring(newName.length() / 2);
  813. System.out.println(newName);
  814. }
  815. }