// Crawler4LoveFootServiceImpl.java
  1. package top.lvzhiqiang.service.impl;
  2. import lombok.extern.slf4j.Slf4j;
  3. import org.jsoup.Connection;
  4. import org.jsoup.Jsoup;
  5. import org.jsoup.nodes.Document;
  6. import org.jsoup.nodes.Element;
  7. import org.jsoup.select.Elements;
  8. import org.springframework.beans.factory.annotation.Value;
  9. import org.springframework.scheduling.annotation.Async;
  10. import org.springframework.stereotype.Service;
  11. import org.springframework.transaction.annotation.Propagation;
  12. import org.springframework.transaction.annotation.Transactional;
  13. import org.springframework.util.StopWatch;
  14. import top.lvzhiqiang.entity.CrawlerLoveFoot;
  15. import top.lvzhiqiang.entity.DicCode;
  16. import top.lvzhiqiang.exception.BusinessException;
  17. import top.lvzhiqiang.mapper.CrawlerLoveFootMapper;
  18. import top.lvzhiqiang.mapper.DicCodeMapper;
  19. import top.lvzhiqiang.mapper.VideoSitePoolMapper;
  20. import top.lvzhiqiang.service.Crawler4LoveFootService;
  21. import top.lvzhiqiang.util.DateUtils;
  22. import top.lvzhiqiang.util.JsoupUtil;
  23. import top.lvzhiqiang.util.StringUtils;
  24. import javax.annotation.Resource;
  25. import java.io.*;
  26. import java.net.InetSocketAddress;
  27. import java.net.Proxy;
  28. import java.nio.charset.StandardCharsets;
  29. import java.time.LocalDate;
  30. import java.time.LocalDateTime;
  31. import java.util.HashMap;
  32. import java.util.List;
  33. import java.util.Map;
  34. import java.util.UUID;
  35. import java.util.stream.Collectors;
  36. /**
  37. * Crawler LoveFoot ServiceImpl
  38. *
  39. * @author lvzhiqiang
  40. * 2022/10/17 14:47
  41. */
@Service
@Slf4j
public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
    @Resource
    private DicCodeMapper dicCodeMapper;
    @Resource
    private CrawlerLoveFootMapper crawlerLoveFootMapper;
    @Resource
    private VideoSitePoolMapper videoSitePoolMapper;
    // Active Spring profile; "dev" makes beforeProxy() route traffic through a local SOCKS proxy.
    @Value("${spring.profiles.active}")
    private String env;
    // Constant maps loaded from dic_code rows by the @Async entry points before use.
    // NOTE(review): package-private mutable fields shared across @Async methods without
    // synchronization — presumably the jobs never run concurrently; confirm scheduling.
    Map<String, String> footConstantMap = null;
    Map<String, String> javbusConstantMap = null;
    Map<String, String> javdbConstantMap = null;
    // javbus anti-blocking mirror URLs loaded from video_site_pool.
    List<String> javbusUrlList = null;
    // Shared request headers: headerMap/header2Map for avnoashi list/detail pages,
    // header3Map for javdb requests (the referer is set per request).
    Map<String, String> headerMap = new HashMap<>();
    Map<String, String> header2Map = new HashMap<>();
    Map<String, String> header3Map = new HashMap<>();
    // Lazily initialised by beforeProxy(): local SOCKS proxy in dev, NO_PROXY otherwise.
    Proxy proxy = null;
  61. public void beforeProxy() {
  62. if (null == proxy) {
  63. if ("dev".equals(env)) {
  64. proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
  65. } else {
  66. proxy = Proxy.NO_PROXY;
  67. }
  68. }
  69. }
  70. @Async
  71. @Override
  72. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  73. public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
  74. log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
  75. StopWatch stopWatch = new StopWatch();
  76. stopWatch.start();
  77. if (isDel == 1) {
  78. crawlerLoveFootMapper.deleteAll();
  79. }
  80. List<DicCode> dicCodeList = dicCodeMapper.findAll();
  81. // 获取常量MAP
  82. footConstantMap = dicCodeList.stream()
  83. .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
  84. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  85. javbusConstantMap = dicCodeList.stream()
  86. .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
  87. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  88. // 获取javbus防屏蔽地址
  89. javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
  90. if (javbusUrlList.size() == 0) {
  91. log.warn("javbusUrlList为空");
  92. return;
  93. }
  94. // 代理及TOKEN设置
  95. beforeProxy();
  96. // 解析原始站点
  97. jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount);
  98. log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds());
  99. }
  100. @Async
  101. @Override
  102. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  103. public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount) {
  104. log.warn("jjsoupLoveFoot4CrawingFail 开始");
  105. StopWatch stopWatch = new StopWatch();
  106. stopWatch.start();
  107. // 获取待抓取码列表
  108. List<CrawlerLoveFoot> loveFootList;
  109. if (1 == ignoreRetryCount) {
  110. loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status);
  111. } else {
  112. loveFootList = crawlerLoveFootMapper.findInfoByStatus(status);
  113. }
  114. if (loveFootList.size() == 0) {
  115. log.warn("loveFootList为空");
  116. return;
  117. }
  118. log.warn("jsoupLoveFoot4CrawingFail loveFootList size={}", loveFootList.size());
  119. List<DicCode> dicCodeList = dicCodeMapper.findAll();
  120. // 获取常量MAP
  121. javbusConstantMap = dicCodeList.stream()
  122. .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
  123. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  124. javdbConstantMap = dicCodeList.stream()
  125. .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
  126. .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
  127. // 代理及TOKEN设置
  128. beforeProxy();
  129. // 解析原始站点
  130. jsoupLoveFoot4CrawingFailSub(loveFootList);
  131. log.warn("jjsoupLoveFoot4CrawingFail 结束:time={}", stopWatch.getTotalTimeSeconds());
  132. }
  133. @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
  134. public void jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList) {
  135. Document javdbSearchDocument;
  136. Document javdbCodeDocument;
  137. for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
  138. String message = null;
  139. int retryCount = 0;
  140. while (retryCount <= 3) {
  141. long start = System.currentTimeMillis();
  142. String javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
  143. header3Map.put("referer", javdbSearchUrl);
  144. try {
  145. javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
  146. Elements itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
  147. if (itembSelects.size() == 0) {
  148. String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
  149. javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
  150. javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
  151. itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
  152. }
  153. if (itembSelects.size() == 0) {
  154. throw new BusinessException(30000, "javdb search result null");
  155. }
  156. // 获取codeUrl
  157. String codeUrl = null;
  158. String title;
  159. for (Element itembSelect : itembSelects) {
  160. title = itembSelect.select("a.box").get(0).attr("title");
  161. if (title.contains(crawlerLoveFoot.getName())) {
  162. codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
  163. break;
  164. }
  165. String newName = crawlerLoveFoot.getName().replace("●", "さ");
  166. if (title.contains(newName)) {
  167. codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
  168. crawlerLoveFoot.setName(newName);
  169. break;
  170. }
  171. }
  172. if (StringUtils.isEmpty(codeUrl)) {
  173. throw new BusinessException(30000, "javdb search result mismatch");
  174. }
  175. // 解析codeUrl
  176. javdbCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
  177. long picTime = parseJavdbCodeDocument(javdbCodeDocument, crawlerLoveFoot);
  178. crawlerLoveFoot.setJavdbUrl(codeUrl);
  179. crawlerLoveFoot.setRetryCount(retryCount);
  180. crawlerLoveFoot.setType(2);
  181. crawlerLoveFoot.setStatus(3);
  182. log.warn("jsoupLoveFoot4CrawingFailSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", crawlerLoveFoot.getName(), crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
  183. break;
  184. } catch (Exception e) {
  185. ++retryCount;
  186. if (retryCount < 4) {
  187. log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, javdbSearchUrl, e);
  188. } else if (retryCount == 4) {
  189. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  190. }
  191. if (e instanceof BusinessException) {
  192. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  193. break;
  194. }
  195. }
  196. }
  197. if (StringUtils.isNotEmpty(message)) {
  198. CrawlerLoveFoot crawlerLoveFoot2 = new CrawlerLoveFoot();
  199. crawlerLoveFoot2.setId(crawlerLoveFoot.getId());
  200. crawlerLoveFoot2.setFailureCause(message);
  201. crawlerLoveFoot2.setRetryCount(retryCount);
  202. crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot2);
  203. } else {
  204. crawlerLoveFoot.setFailureCause("");
  205. crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot);
  206. }
  207. }
  208. }
  209. private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException {
  210. Elements container = javdbCodeDocument.select("section.section > div.container");
  211. if (container.size() == 0) {
  212. throw new BusinessException(30000, "番号无效!");
  213. }
  214. Elements videoDetail = container.select("div.video-detail");
  215. // 名称
  216. // crawlerLoveFoot.setName(videoDetail.select("h2.title").select("strong.current-title").text().trim());
  217. Elements moviePanelInfos = videoDetail.select("nav.movie-panel-info");
  218. Element pEle = moviePanelInfos.get(0);
  219. // 识别码
  220. String iCode = pEle.select("div:contains(番號)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
  221. crawlerLoveFoot.setIdentificationCode(iCode);
  222. // 发行日期
  223. String issueDate = pEle.select("div:contains(日期)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
  224. crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
  225. // 长度
  226. String length = pEle.select("div:contains(時長)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
  227. crawlerLoveFoot.setLength(length);
  228. // 导演
  229. Elements directorEles = pEle.select("div:contains(導演)").select("span.value");
  230. if (directorEles.size() > 0) {
  231. crawlerLoveFoot.setDirector(directorEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
  232. }
  233. // 制作商/片商
  234. Elements markerEles = pEle.select("div:contains(片商)").select("span.value");
  235. if (markerEles.size() > 0) {
  236. crawlerLoveFoot.setMaker(markerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
  237. }
  238. // 发行商
  239. Elements issuerEles = pEle.select("div:contains(發行)").select("span.value");
  240. if (issuerEles.size() > 0) {
  241. crawlerLoveFoot.setIssuer(issuerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
  242. }
  243. // 类别
  244. Elements genresEles = pEle.select("div:contains(類別)").select("span.value");
  245. if (genresEles.size() > 0) {
  246. StringBuffer sb = new StringBuffer();
  247. Elements ahrefEles = genresEles.first().select("a[href]");
  248. for (Element ahrefEle : ahrefEles) {
  249. sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
  250. }
  251. if (sb.length() > 0) {
  252. sb = sb.deleteCharAt(sb.length() - 1);
  253. }
  254. crawlerLoveFoot.setGenres(sb.toString());
  255. }
  256. // 演员
  257. Elements castEles = pEle.select("div:contains(演員)").select("span.value");
  258. if (castEles.size() > 0) {
  259. StringBuffer sb = new StringBuffer();
  260. Elements ahrefEles = castEles.first().select("a[href]");
  261. for (Element ahrefEle : ahrefEles) {
  262. sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
  263. }
  264. if (sb.length() > 0) {
  265. sb = sb.deleteCharAt(sb.length() - 1);
  266. }
  267. crawlerLoveFoot.setCast(sb.toString());
  268. }
  269. // 图片URL
  270. Elements videoMetaPanel = videoDetail.select("div.column-video-cover");
  271. String href = videoMetaPanel.select("a > img").first().attr("src");
  272. long start = System.currentTimeMillis();
  273. Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
  274. String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(crawlerLoveFoot.getName());
  275. byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
  276. if (imgUrlBytes.length > 251) {
  277. byte[] imgUrlDestBytes = new byte[251];
  278. System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
  279. fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
  280. }
  281. fileName = fileName.concat(".jpg");
  282. String machiImgUrl = "足舐/".concat(fileName);
  283. saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
  284. long end = System.currentTimeMillis();
  285. crawlerLoveFoot.setImgUrl(machiImgUrl);
  286. return end - start;
  287. }
    /**
     * Crawls avnoashi listing pages and upserts one CrawlerLoveFoot row per article, following
     * the "Next" pager until exhausted. Stops the whole crawl as soon as an article's update
     * date is not newer than the latest stored row (the listing appears to be newest-first —
     * the referer uses "?sort=newer" — TODO confirm).
     *
     * @param status           NOTE(review): never read in this method — confirm intent
     * @param ignoreRetryCount NOTE(review): never read in this method — confirm intent
     * @throws Exception on unrecoverable failure (per-article failures are caught and persisted)
     */
    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
    public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
        // Incremental watermark: update date of the most recent stored row (epoch if none).
        CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo();
        LocalDate latestDate;
        if (latestLoveFoot == null) {
            latestDate = LocalDate.of(1970, 1, 1);
        } else {
            latestDate = latestLoveFoot.getUpdateDate();
        }
        String avnoashiUrl = footConstantMap.get("avnoashi_url");
        headerMap.put("referer", avnoashiUrl);
        header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
        Document loveFootDocument;
        Document loveFootDetailDocument;
        outer:
        while (true) {
            // One listing page per iteration; avnoashiUrl is advanced at the bottom of the loop.
            loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
            log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
            Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
            for (Element sourceSelect : sourceSelects) {
                String sourceUrl = sourceSelect.select("a").attr("abs:href");
                // Defaults recorded by the catch block when a step below fails:
                // statusInt starts at 2, becomes 1 once keywords are found, 4 if the code lookup fails.
                Integer statusInt = 2;
                Integer typeInt = 1;
                LocalDate clockDate = null;
                LocalDate updateDate = null;
                String keywords = null;
                try {
                    loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
                    String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
                    String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
                    clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
                    updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
                    // Reached already-seen content — stop the whole crawl.
                    if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
                        break outer;
                    }
                    // Extract the title keywords from the detail table.
                    keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
                    if (StringUtils.isNotEmpty(keywords)) {
                        statusInt = 1;
                        log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
                    } else {
                        throw new Exception("keywords is null");
                    }
                    // Resolve the keywords to an identification code via javbus.
                    CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
                    crawlerLoveFoot.setClockDate(clockDate);
                    crawlerLoveFoot.setUpdateDate(updateDate);
                    crawlerLoveFoot.setOrginUrl(sourceUrl);
                    crawlerLoveFoot.setType(2);
                    crawlerLoveFoot.setStatus(3);
                    String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
                    if (StringUtils.isNotEmpty(message)) {
                        statusInt = 4; // code lookup failed
                        throw new Exception(message);
                    }
                    crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
                } catch (Exception e) {
                    // Persist a failure row (random UUID as placeholder code) so the entry
                    // can be retried later via jsoupLoveFoot4CrawingFail.
                    log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
                    CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
                    crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
                    crawlerLoveFoot.setOrginUrl(sourceUrl);
                    crawlerLoveFoot.setClockDate(clockDate);
                    crawlerLoveFoot.setUpdateDate(updateDate);
                    crawlerLoveFoot.setName(keywords);
                    crawlerLoveFoot.setType(typeInt);
                    crawlerLoveFoot.setStatus(statusInt);
                    crawlerLoveFoot.setCreateTime(LocalDateTime.now());
                    crawlerLoveFoot.setFailureCause(e.getMessage());
                    crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
                }
            }
            // Follow the "Next" pager link, if any.
            Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)");
            if (nextSelects.size() > 0) {
                avnoashiUrl = nextSelects.get(0).attr("abs:href");
            } else {
                break;
            }
        }
    }
  368. private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
  369. int retryCount = 0;
  370. Document javbusSearchDocument;
  371. Document javbusCodeDocument;
  372. String message = null;
  373. while (retryCount <= 3) {
  374. long start = System.currentTimeMillis();
  375. String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
  376. String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
  377. try {
  378. javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  379. Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
  380. if (itembSelects.size() == 0) {
  381. throw new BusinessException(30000, "javbus search result null");
  382. }
  383. // 获取codeUrl
  384. String codeUrl = itembSelects.select("a.movie-box").get(0).attr("abs:href");
  385. // 解析codeUrl
  386. javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
  387. long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
  388. crawlerLoveFoot.setRetryCount(retryCount);
  389. log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
  390. break;
  391. } catch (Exception e) {
  392. ++retryCount;
  393. if (retryCount < 4) {
  394. log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e);
  395. } else if (retryCount == 4) {
  396. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  397. }
  398. if (e instanceof BusinessException) {
  399. message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
  400. break;
  401. }
  402. }
  403. }
  404. return message;
  405. }
    /**
     * Parses a javbus detail page into {@code crawlerLoveFoot} and downloads the cover image.
     *
     * @param document        javbus detail page DOM
     * @param crawlerLoveFoot entity populated with name, code, dates, staff, genres, cast, image
     * @return time in ms spent downloading and saving the cover image
     * @throws Exception if the page is not a valid detail page or the image I/O fails
     */
    private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception {
        Elements container = document.select("div.container");
        if (container.size() == 0) {
            throw new BusinessException(30000, "番号无效!");
        }
        // Name: the <h3> holds "<code> <title>"; drop the leading code token when a title follows.
        String h3 = container.select("h3").first().text();
        String[] nameArr = h3.split("\\s+");
        if (nameArr.length > 1) {
            crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim());
        } else {
            crawlerLoveFoot.setName(nameArr[0]);
        }
        Elements pEles = container.select("div.info > p");
        // Identification code.
        Element pEle = pEles.get(0);
        String iCode = pEle.select("span[style]").first().text();
        crawlerLoveFoot.setIdentificationCode(iCode);
        // Issue date — the value after the ":" label separator.
        pEle = pEles.get(1);
        String issueDate = pEle.text().split(":")[1].replace("\"", "").trim();
        crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
        // Duration.
        pEle = pEles.get(2);
        String length = pEle.text().split(":")[1].replace("\"", "").trim();
        crawlerLoveFoot.setLength(length);
        // Director (optional).
        Elements directorEles = container.select("div.info").select("p:contains(導演)");
        if (directorEles.size() > 0) {
            pEle = directorEles.first().select("a[href]").first();
            crawlerLoveFoot.setDirector(pEle.text());
        }
        // Maker (optional).
        Elements markerEles = container.select("div.info").select("p:contains(製作商)");
        if (markerEles.size() > 0) {
            pEle = markerEles.first().select("a[href]").first();
            crawlerLoveFoot.setMaker(pEle.text());
        }
        // Issuer (optional).
        Elements issuerEles = container.select("div.info").select("p:contains(發行商)");
        if (issuerEles.size() > 0) {
            pEle = issuerEles.first().select("a[href]").first();
            crawlerLoveFoot.setIssuer(pEle.text());
        }
        // Genres: the anchors live in the sibling element after the "類別" label paragraph.
        Elements genresEles = container.select("div.info").select("p:contains(類別)");
        if (genresEles.size() > 0) {
            StringBuffer sb = new StringBuffer();
            Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]");
            for (Element ahrefEle : ahrefEles) {
                sb.append(ahrefEle.text()).append(",");
            }
            if (sb.length() > 0) {
                sb = sb.deleteCharAt(sb.length() - 1); // drop trailing comma
            }
            crawlerLoveFoot.setGenres(sb.toString());
        }
        // Cast: two siblings after the "演員" label; skipped when the page reports no cast info.
        Elements castEles = container.select("div.info").select("p.star-show:contains(演員)");
        if (castEles.size() > 0) {
            Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)");
            if (castElesTemp.size() == 0) {
                StringBuffer sb = new StringBuffer();
                Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]");
                for (Element ahrefEle : ahrefEles) {
                    sb.append(ahrefEle.text()).append(",");
                }
                if (sb.length() > 0) {
                    sb = sb.deleteCharAt(sb.length() - 1); // drop trailing comma
                }
                crawlerLoveFoot.setCast(sb.toString());
            }
        }
        // Cover image: download and save under <apics_path>/足舐/.
        String href = container.select("a.bigImage").first().attr("abs:href");
        long start = System.currentTimeMillis();
        Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
        String fileName = issueDate.concat(" ").concat(h3);
        // Cap the file name at 251 UTF-8 bytes (255 minus ".jpg"); a multi-byte character cut
        // at the boundary decodes to U+FFFD and is stripped.
        byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
        if (imgUrlBytes.length > 251) {
            byte[] imgUrlDestBytes = new byte[251];
            System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
            fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
        }
        fileName = fileName.concat(".jpg");
        String machiImgUrl = "足舐/".concat(fileName);
        saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
        long end = System.currentTimeMillis();
        crawlerLoveFoot.setImgUrl(machiImgUrl);
        crawlerLoveFoot.setCreateTime(LocalDateTime.now());
        return end - start;
    }
  498. /**
  499. * 保存文件到本地
  500. *
  501. * @param bufferedInputStream
  502. * @param savePath
  503. */
  504. private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException {
  505. //一次最多读取1k
  506. byte[] buffer = new byte[1024];
  507. //实际读取的长度
  508. int readLenghth;
  509. //创建的一个写出的缓冲流
  510. BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath)));
  511. //文件逐步写入本地
  512. while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
  513. bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
  514. }
  515. //关闭缓冲流
  516. bufferedOutputStream.close();
  517. bufferedInputStream.close();
  518. }
  519. public static void main(String[] args) {
  520. String s = "嫉妬に狂った愛人のエグい杭打ちピストンにどハマり…都合の良いオンナのはずが快楽沼へ引きずり込まれた僕 七ツ森りり";
  521. String newName = s.substring(s.length() / 2);
  522. System.out.println(newName);
  523. }
  524. }