package top.lvzhiqiang.service.impl; import lombok.extern.slf4j.Slf4j; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.beans.factory.annotation.Value; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; import org.springframework.util.StopWatch; import top.lvzhiqiang.entity.CrawlerLoveFoot; import top.lvzhiqiang.entity.DicCode; import top.lvzhiqiang.exception.BusinessException; import top.lvzhiqiang.mapper.CrawlerLoveFootMapper; import top.lvzhiqiang.mapper.DicCodeMapper; import top.lvzhiqiang.mapper.VideoSitePoolMapper; import top.lvzhiqiang.service.Crawler4LoveFootService; import top.lvzhiqiang.util.DateUtils; import top.lvzhiqiang.util.JsoupUtil; import top.lvzhiqiang.util.StringUtils; import javax.annotation.Resource; import java.io.*; import java.net.InetSocketAddress; import java.net.Proxy; import java.nio.charset.StandardCharsets; import java.time.LocalDate; import java.time.LocalDateTime; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; /** * Crawler LoveFoot ServiceImpl * * @author lvzhiqiang * 2022/10/17 14:47 */ @Service @Slf4j public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService { @Resource private DicCodeMapper dicCodeMapper; @Resource private CrawlerLoveFootMapper crawlerLoveFootMapper; @Resource private VideoSitePoolMapper videoSitePoolMapper; @Value("${spring.profiles.active}") private String env; Map footConstantMap = null; Map javbusConstantMap = null; Map javdbConstantMap = null; List javbusUrlList = null; Map headerMap = new HashMap<>(); Map header2Map = new HashMap<>(); Map header3Map = new HashMap<>(); Proxy proxy = null; public void beforeProxy() { if (null == proxy) { if ("dev".equals(env)) { proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080)); } else { proxy = Proxy.NO_PROXY; } } } @Async @Override @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class) public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception { log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount); StopWatch stopWatch = new StopWatch(); stopWatch.start(); if (isDel == 1) { crawlerLoveFootMapper.deleteAll(); } List dicCodeList = dicCodeMapper.findAll(); // 获取常量MAP footConstantMap = dicCodeList.stream() .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env)) .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1)); javbusConstantMap = dicCodeList.stream() .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env)) .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1)); // 获取javbus防屏蔽地址 javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1); if (javbusUrlList.size() == 0) { log.warn("javbusUrlList为空"); return; } // 代理及TOKEN设置 beforeProxy(); // 解析原始站点 jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount); log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds()); } @Async @Override @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class) public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount) { log.warn("jjsoupLoveFoot4CrawingFail 开始"); StopWatch stopWatch = new StopWatch(); stopWatch.start(); // 获取待抓取码列表 List loveFootList; if (1 == ignoreRetryCount) { loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status); } else { loveFootList = crawlerLoveFootMapper.findInfoByStatus(status); } if (loveFootList.size() == 0) { log.warn("loveFootList为空"); return; } log.warn("jsoupLoveFoot4CrawingFail loveFootList size={}", loveFootList.size()); List dicCodeList = dicCodeMapper.findAll(); // 获取常量MAP javbusConstantMap = dicCodeList.stream() .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env)) .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1)); javdbConstantMap = dicCodeList.stream() .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env)) .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1)); // 代理及TOKEN设置 beforeProxy(); // 解析原始站点 jsoupLoveFoot4CrawingFailSub(loveFootList); log.warn("jjsoupLoveFoot4CrawingFail 结束:time={}", stopWatch.getTotalTimeSeconds()); } @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class) public void jsoupLoveFoot4CrawingFailSub(List loveFootList) { Document javdbSearchDocument; Document javdbCodeDocument; for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) { String message = null; int retryCount = 0; while (retryCount <= 3) { long start = System.currentTimeMillis(); String javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all"); header3Map.put("referer", javdbSearchUrl); try { javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null); Elements itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item"); if (itembSelects.size() == 0) { String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2); javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all"); javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null); itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item"); } if (itembSelects.size() == 0) { throw new BusinessException(30000, "javdb search result null"); } // 获取codeUrl String codeUrl = null; String title; for (Element itembSelect : itembSelects) { title = itembSelect.select("a.box").get(0).attr("title"); if (title.contains(crawlerLoveFoot.getName())) { codeUrl = itembSelect.select("a.box").get(0).attr("abs:href"); break; } String newName = crawlerLoveFoot.getName().replace("●", "さ"); if (title.contains(newName)) { codeUrl = itembSelect.select("a.box").get(0).attr("abs:href"); crawlerLoveFoot.setName(newName); break; } } if (StringUtils.isEmpty(codeUrl)) { throw new BusinessException(30000, "javdb search result mismatch"); } // 解析codeUrl javdbCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null); long picTime = parseJavdbCodeDocument(javdbCodeDocument, crawlerLoveFoot); crawlerLoveFoot.setJavdbUrl(codeUrl); crawlerLoveFoot.setRetryCount(retryCount); crawlerLoveFoot.setType(2); crawlerLoveFoot.setStatus(3); log.warn("jsoupLoveFoot4CrawingFailSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", crawlerLoveFoot.getName(), crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start); break; } catch (Exception e) { ++retryCount; if (retryCount < 4) { log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, javdbSearchUrl, e); } else if (retryCount == 4) { message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200); } if (e instanceof BusinessException) { message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200); break; } } } if (StringUtils.isNotEmpty(message)) { CrawlerLoveFoot crawlerLoveFoot2 = new CrawlerLoveFoot(); crawlerLoveFoot2.setId(crawlerLoveFoot.getId()); crawlerLoveFoot2.setFailureCause(message); crawlerLoveFoot2.setRetryCount(retryCount); crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot2); } else { crawlerLoveFoot.setFailureCause(""); crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot); } } } private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException { Elements container = javdbCodeDocument.select("section.section > div.container"); if (container.size() == 0) { throw new BusinessException(30000, "番号无效!"); } Elements videoDetail = container.select("div.video-detail"); // 名称 // crawlerLoveFoot.setName(videoDetail.select("h2.title").select("strong.current-title").text().trim()); Elements moviePanelInfos = videoDetail.select("nav.movie-panel-info"); Element pEle = moviePanelInfos.get(0); // 识别码 String iCode = pEle.select("div:contains(番號)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", ""); crawlerLoveFoot.setIdentificationCode(iCode); // 发行日期 String issueDate = pEle.select("div:contains(日期)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", ""); crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter)); // 长度 String length = pEle.select("div:contains(時長)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", ""); crawlerLoveFoot.setLength(length); // 导演 Elements directorEles = pEle.select("div:contains(導演)").select("span.value"); if (directorEles.size() > 0) { crawlerLoveFoot.setDirector(directorEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", "")); } // 制作商/片商 Elements markerEles = pEle.select("div:contains(片商)").select("span.value"); if (markerEles.size() > 0) { crawlerLoveFoot.setMaker(markerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", "")); } // 发行商 Elements issuerEles = pEle.select("div:contains(發行)").select("span.value"); if (issuerEles.size() > 0) { crawlerLoveFoot.setIssuer(issuerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", "")); } // 类别 Elements genresEles = pEle.select("div:contains(類別)").select("span.value"); if (genresEles.size() > 0) { StringBuffer sb = new StringBuffer(); Elements ahrefEles = genresEles.first().select("a[href]"); for (Element ahrefEle : ahrefEles) { sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(","); } if (sb.length() > 0) { sb = sb.deleteCharAt(sb.length() - 1); } crawlerLoveFoot.setGenres(sb.toString()); } // 演员 Elements castEles = pEle.select("div:contains(演員)").select("span.value"); if (castEles.size() > 0) { StringBuffer sb = new StringBuffer(); Elements ahrefEles = castEles.first().select("a[href]"); for (Element ahrefEle : ahrefEles) { sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(","); } if (sb.length() > 0) { sb = sb.deleteCharAt(sb.length() - 1); } crawlerLoveFoot.setCast(sb.toString()); } // 图片URL Elements videoMetaPanel = videoDetail.select("div.column-video-cover"); String href = videoMetaPanel.select("a > img").first().attr("src"); long start = System.currentTimeMillis(); Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute(); String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(crawlerLoveFoot.getName()); byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8); if (imgUrlBytes.length > 251) { byte[] imgUrlDestBytes = new byte[251]; System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251); fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", ""); } fileName = fileName.concat(".jpg"); String machiImgUrl = "足舐/".concat(fileName); saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl)); long end = System.currentTimeMillis(); crawlerLoveFoot.setImgUrl(machiImgUrl); return end - start; } @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class) public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception { CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo(); LocalDate latestDate; if (latestLoveFoot == null) { latestDate = LocalDate.of(1970, 1, 1); } else { latestDate = latestLoveFoot.getUpdateDate(); } String avnoashiUrl = footConstantMap.get("avnoashi_url"); headerMap.put("referer", avnoashiUrl); header2Map.put("referer", avnoashiUrl.concat("?sort=newer")); Document loveFootDocument; Document loveFootDetailDocument; outer: while (true) { loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null); log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl); Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2"); for (Element sourceSelect : sourceSelects) { String sourceUrl = sourceSelect.select("a").attr("abs:href"); Integer statusInt = 2; Integer typeInt = 1; LocalDate clockDate = null; LocalDate updateDate = null; String keywords = null; try { loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null); String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text(); String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text(); clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3); updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3); if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) { break outer; } // 获取关键词 keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text(); if (StringUtils.isNotEmpty(keywords)) { statusInt = 1; log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords); } else { throw new Exception("keywords is null"); } // 通过关键词获取识别码 CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot(); crawlerLoveFoot.setClockDate(clockDate); crawlerLoveFoot.setUpdateDate(updateDate); crawlerLoveFoot.setOrginUrl(sourceUrl); crawlerLoveFoot.setType(2); crawlerLoveFoot.setStatus(3); String message = parseKeywordsToCode(crawlerLoveFoot, keywords); if (StringUtils.isNotEmpty(message)) { statusInt = 4; throw new Exception(message); } crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot); } catch (Exception e) { log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e); CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot(); crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString()); crawlerLoveFoot.setOrginUrl(sourceUrl); crawlerLoveFoot.setClockDate(clockDate); crawlerLoveFoot.setUpdateDate(updateDate); crawlerLoveFoot.setName(keywords); crawlerLoveFoot.setType(typeInt); crawlerLoveFoot.setStatus(statusInt); crawlerLoveFoot.setCreateTime(LocalDateTime.now()); crawlerLoveFoot.setFailureCause(e.getMessage()); crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot); } } // 继续下一页 Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)"); if (nextSelects.size() > 0) { avnoashiUrl = nextSelects.get(0).attr("abs:href"); } else { break; } } } private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) { int retryCount = 0; Document javbusSearchDocument; Document javbusCodeDocument; String message = null; while (retryCount <= 3) { long start = System.currentTimeMillis(); String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size()))); String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce"); try { javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null); Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item"); if (itembSelects.size() == 0) { throw new BusinessException(30000, "javbus search result null"); } // 获取codeUrl String codeUrl = itembSelects.select("a.movie-box").get(0).attr("abs:href"); // 解析codeUrl javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null); long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot); crawlerLoveFoot.setRetryCount(retryCount); log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start); break; } catch (Exception e) { ++retryCount; if (retryCount < 4) { log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e); } else if (retryCount == 4) { message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200); } if (e instanceof BusinessException) { message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200); break; } } } return message; } private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception { Elements container = document.select("div.container"); if (container.size() == 0) { throw new BusinessException(30000, "番号无效!"); } // 名称 String h3 = container.select("h3").first().text(); String[] nameArr = h3.split("\\s+"); if (nameArr.length > 1) { crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim()); } else { crawlerLoveFoot.setName(nameArr[0]); } Elements pEles = container.select("div.info > p"); // 识别码 Element pEle = pEles.get(0); String iCode = pEle.select("span[style]").first().text(); crawlerLoveFoot.setIdentificationCode(iCode); // 发行日期 pEle = pEles.get(1); String issueDate = pEle.text().split(":")[1].replace("\"", "").trim(); crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter)); // 长度 pEle = pEles.get(2); String length = pEle.text().split(":")[1].replace("\"", "").trim(); crawlerLoveFoot.setLength(length); // 导演 Elements directorEles = container.select("div.info").select("p:contains(導演)"); if (directorEles.size() > 0) { pEle = directorEles.first().select("a[href]").first(); crawlerLoveFoot.setDirector(pEle.text()); } // 制作商 Elements markerEles = container.select("div.info").select("p:contains(製作商)"); if (markerEles.size() > 0) { pEle = markerEles.first().select("a[href]").first(); crawlerLoveFoot.setMaker(pEle.text()); } // 发行商 Elements issuerEles = container.select("div.info").select("p:contains(發行商)"); if (issuerEles.size() > 0) { pEle = issuerEles.first().select("a[href]").first(); crawlerLoveFoot.setIssuer(pEle.text()); } // 类别 Elements genresEles = container.select("div.info").select("p:contains(類別)"); if (genresEles.size() > 0) { StringBuffer sb = new StringBuffer(); Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]"); for (Element ahrefEle : ahrefEles) { sb.append(ahrefEle.text()).append(","); } if (sb.length() > 0) { sb = sb.deleteCharAt(sb.length() - 1); } crawlerLoveFoot.setGenres(sb.toString()); } // 演员 Elements castEles = container.select("div.info").select("p.star-show:contains(演員)"); if (castEles.size() > 0) { Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)"); if (castElesTemp.size() == 0) { StringBuffer sb = new StringBuffer(); Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]"); for (Element ahrefEle : ahrefEles) { sb.append(ahrefEle.text()).append(","); } if (sb.length() > 0) { sb = sb.deleteCharAt(sb.length() - 1); } crawlerLoveFoot.setCast(sb.toString()); } } // 图片URL String href = container.select("a.bigImage").first().attr("abs:href"); long start = System.currentTimeMillis(); Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute(); String fileName = issueDate.concat(" ").concat(h3); byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8); if (imgUrlBytes.length > 251) { byte[] imgUrlDestBytes = new byte[251]; System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251); fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", ""); } fileName = fileName.concat(".jpg"); String machiImgUrl = "足舐/".concat(fileName); saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl)); long end = System.currentTimeMillis(); crawlerLoveFoot.setImgUrl(machiImgUrl); crawlerLoveFoot.setCreateTime(LocalDateTime.now()); return end - start; } /** * 保存文件到本地 * * @param bufferedInputStream * @param savePath */ private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException { //一次最多读取1k byte[] buffer = new byte[1024]; //实际读取的长度 int readLenghth; //创建的一个写出的缓冲流 BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath))); //文件逐步写入本地 while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中 bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地 } //关闭缓冲流 bufferedOutputStream.close(); bufferedInputStream.close(); } public static void main(String[] args) { String s = "嫉妬に狂った愛人のエグい杭打ちピストンにどハマり…都合の良いオンナのはずが快楽沼へ引きずり込まれた僕 七ツ森りり"; String newName = s.substring(s.length() / 2); System.out.println(newName); } }