|
|
@@ -0,0 +1,359 @@
|
|
|
+package top.lvzhiqiang.service.impl;
|
|
|
+
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.jsoup.Connection;
|
|
|
+import org.jsoup.Jsoup;
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
+import org.jsoup.select.Elements;
|
|
|
+import org.springframework.beans.factory.annotation.Value;
|
|
|
+import org.springframework.scheduling.annotation.Async;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+import org.springframework.transaction.annotation.Propagation;
|
|
|
+import org.springframework.transaction.annotation.Transactional;
|
|
|
+import org.springframework.util.StopWatch;
|
|
|
+import top.lvzhiqiang.entity.CrawlerLoveFoot;
|
|
|
+import top.lvzhiqiang.entity.DicCode;
|
|
|
+import top.lvzhiqiang.exception.BusinessException;
|
|
|
+import top.lvzhiqiang.mapper.CrawlerLoveFootMapper;
|
|
|
+import top.lvzhiqiang.mapper.DicCodeMapper;
|
|
|
+import top.lvzhiqiang.mapper.VideoSitePoolMapper;
|
|
|
+import top.lvzhiqiang.service.Crawler4LoveFootService;
|
|
|
+import top.lvzhiqiang.util.DateUtils;
|
|
|
+import top.lvzhiqiang.util.JsoupUtil;
|
|
|
+import top.lvzhiqiang.util.StringUtils;
|
|
|
+
|
|
|
+import javax.annotation.Resource;
|
|
|
+import java.io.*;
|
|
|
+import java.net.InetSocketAddress;
|
|
|
+import java.net.Proxy;
|
|
|
+import java.nio.charset.StandardCharsets;
|
|
|
+import java.time.LocalDate;
|
|
|
+import java.time.LocalDateTime;
|
|
|
+import java.util.HashMap;
|
|
|
+import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.UUID;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * Crawler LoveFoot ServiceImpl
|
|
|
+ *
|
|
|
+ * @author lvzhiqiang
|
|
|
+ * 2022/10/17 14:47
|
|
|
+ */
|
|
|
+@Service
|
|
|
+@Slf4j
|
|
|
+public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private DicCodeMapper dicCodeMapper;
|
|
|
+ @Resource
|
|
|
+ private CrawlerLoveFootMapper crawlerLoveFootMapper;
|
|
|
+ @Resource
|
|
|
+ private VideoSitePoolMapper videoSitePoolMapper;
|
|
|
+ @Value("${spring.profiles.active}")
|
|
|
+ private String env;
|
|
|
+
|
|
|
+ Map<String, String> footConstantMap = null;
|
|
|
+ Map<String, String> javbusConstantMap = null;
|
|
|
+ List<String> javbusUrlList = null;
|
|
|
+ Map<String, String> headerMap = new HashMap<>();
|
|
|
+ Map<String, String> header2Map = new HashMap<>();
|
|
|
+ Proxy proxy = null;
|
|
|
+
|
|
|
+ public void beforeProxy() {
|
|
|
+ if (null == proxy) {
|
|
|
+ if ("dev".equals(env)) {
|
|
|
+ proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
|
|
|
+ } else {
|
|
|
+ proxy = Proxy.NO_PROXY;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ @Async
|
|
|
+ @Override
|
|
|
+ @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
+ public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
|
|
|
+ log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
|
|
|
+ StopWatch stopWatch = new StopWatch();
|
|
|
+ stopWatch.start();
|
|
|
+ if (isDel == 1) {
|
|
|
+ crawlerLoveFootMapper.deleteAll();
|
|
|
+ }
|
|
|
+
|
|
|
+ List<DicCode> dicCodeList = dicCodeMapper.findAll();
|
|
|
+ // 获取常量MAP
|
|
|
+ footConstantMap = dicCodeList.stream()
|
|
|
+ .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ javbusConstantMap = dicCodeList.stream()
|
|
|
+ .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ // 获取javbus防屏蔽地址
|
|
|
+ javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
|
|
|
+ if (javbusUrlList.size() == 0) {
|
|
|
+ log.warn("javbusUrlList为空");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ // 代理及TOKEN设置
|
|
|
+ beforeProxy();
|
|
|
+ // 解析原始站点
|
|
|
+ jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount);
|
|
|
+ log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds());
|
|
|
+ }
|
|
|
+
|
|
|
+ @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
+ public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
|
|
|
+ CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo();
|
|
|
+
|
|
|
+ LocalDate latestDate;
|
|
|
+ if (latestLoveFoot == null) {
|
|
|
+ latestDate = LocalDate.of(1970, 1, 1);
|
|
|
+ } else {
|
|
|
+ latestDate = latestLoveFoot.getUpdateDate();
|
|
|
+ }
|
|
|
+
|
|
|
+ String avnoashiUrl = footConstantMap.get("avnoashi_url");
|
|
|
+ headerMap.put("referer", avnoashiUrl);
|
|
|
+ header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
|
|
|
+ Document loveFootDocument;
|
|
|
+ Document loveFootDetailDocument;
|
|
|
+ outer:
|
|
|
+ while (true) {
|
|
|
+ loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
|
|
|
+ log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
|
|
|
+
|
|
|
+ Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
|
|
|
+ for (Element sourceSelect : sourceSelects) {
|
|
|
+ String sourceUrl = sourceSelect.select("a").attr("abs:href");
|
|
|
+
|
|
|
+ Integer statusInt = 2;
|
|
|
+ try {
|
|
|
+ loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
|
|
|
+ String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
|
|
|
+ String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
|
|
|
+ LocalDate clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
|
|
|
+ LocalDate updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
|
|
|
+
|
|
|
+ if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
|
|
|
+ break outer;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 获取关键词
|
|
|
+ String keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
|
|
|
+ if (StringUtils.isNotEmpty(keywords)) {
|
|
|
+ statusInt = 1;
|
|
|
+ log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
|
|
|
+ } else {
|
|
|
+ throw new Exception("keywords is null");
|
|
|
+ }
|
|
|
+
|
|
|
+ // 通过关键词获取识别码
|
|
|
+ CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
|
|
|
+ crawlerLoveFoot.setClockDate(clockDate);
|
|
|
+ crawlerLoveFoot.setUpdateDate(updateDate);
|
|
|
+ crawlerLoveFoot.setOrginUrl(sourceUrl);
|
|
|
+ crawlerLoveFoot.setType(1);
|
|
|
+ crawlerLoveFoot.setStatus(3);
|
|
|
+ String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
|
|
|
+ if (StringUtils.isNotEmpty(message)) {
|
|
|
+ statusInt = 4;
|
|
|
+ throw new Exception(message);
|
|
|
+ }
|
|
|
+
|
|
|
+ crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
|
|
|
+ CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
|
|
|
+ crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
|
|
|
+ crawlerLoveFoot.setOrginUrl(sourceUrl);
|
|
|
+ crawlerLoveFoot.setType(1);
|
|
|
+ crawlerLoveFoot.setStatus(statusInt);
|
|
|
+ crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
+ crawlerLoveFoot.setFailureCause(e.getMessage());
|
|
|
+ crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 继续下一页
|
|
|
+ Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)");
|
|
|
+ if (nextSelects.size() > 0) {
|
|
|
+ avnoashiUrl = nextSelects.get(0).attr("abs:href");
|
|
|
+ } else {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
|
|
|
+ int retryCount = 0;
|
|
|
+ Document javbusSearchDocument;
|
|
|
+ Document javbusCodeDocument;
|
|
|
+ String message = null;
|
|
|
+ while (retryCount <= 3) {
|
|
|
+ long start = System.currentTimeMillis();
|
|
|
+ String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
|
+ String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+
|
|
|
+ Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
|
|
|
+ if (itembSelects.size() == 0) {
|
|
|
+ throw new BusinessException(30000, "search result null");
|
|
|
+ }
|
|
|
+
|
|
|
+ // 获取codeUrl
|
|
|
+ String codeUrl = itembSelects.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ // 解析codeUrl
|
|
|
+ javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
|
|
|
+
|
|
|
+ crawlerLoveFoot.setRetryCount(retryCount);
|
|
|
+ log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
|
|
|
+
|
|
|
+ break;
|
|
|
+ } catch (Exception e) {
|
|
|
+ ++retryCount;
|
|
|
+
|
|
|
+ if (retryCount < 4) {
|
|
|
+ log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e);
|
|
|
+ } else if (retryCount == 4) {
|
|
|
+ message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (e instanceof BusinessException) {
|
|
|
+ message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return message;
|
|
|
+ }
|
|
|
+
|
|
|
+ private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception {
|
|
|
+ Elements container = document.select("div.container");
|
|
|
+ if (container.size() == 0) {
|
|
|
+ throw new BusinessException(30000, "番号无效!");
|
|
|
+ }
|
|
|
+
|
|
|
+ // 名称
|
|
|
+ String h3 = container.select("h3").first().text();
|
|
|
+ String[] nameArr = h3.split("\\s+");
|
|
|
+ if (nameArr.length > 1) {
|
|
|
+ crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim());
|
|
|
+ } else {
|
|
|
+ crawlerLoveFoot.setName(nameArr[0]);
|
|
|
+ }
|
|
|
+
|
|
|
+ Elements pEles = container.select("div.info > p");
|
|
|
+ // 识别码
|
|
|
+ Element pEle = pEles.get(0);
|
|
|
+ String iCode = pEle.select("span[style]").first().text();
|
|
|
+ crawlerLoveFoot.setIdentificationCode(iCode);
|
|
|
+ // 发行日期
|
|
|
+ pEle = pEles.get(1);
|
|
|
+ String issueDate = pEle.text().split(":")[1].replace("\"", "").trim();
|
|
|
+ crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
|
|
|
+ // 长度
|
|
|
+ pEle = pEles.get(2);
|
|
|
+ String length = pEle.text().split(":")[1].replace("\"", "").trim();
|
|
|
+ crawlerLoveFoot.setLength(length);
|
|
|
+ // 导演
|
|
|
+ Elements directorEles = container.select("div.info").select("p:contains(導演)");
|
|
|
+ if (directorEles.size() > 0) {
|
|
|
+ pEle = directorEles.first().select("a[href]").first();
|
|
|
+ crawlerLoveFoot.setDirector(pEle.text());
|
|
|
+ }
|
|
|
+ // 制作商
|
|
|
+ Elements markerEles = container.select("div.info").select("p:contains(製作商)");
|
|
|
+ if (markerEles.size() > 0) {
|
|
|
+ pEle = markerEles.first().select("a[href]").first();
|
|
|
+ crawlerLoveFoot.setMaker(pEle.text());
|
|
|
+ }
|
|
|
+ // 发行商
|
|
|
+ Elements issuerEles = container.select("div.info").select("p:contains(發行商)");
|
|
|
+ if (issuerEles.size() > 0) {
|
|
|
+ pEle = issuerEles.first().select("a[href]").first();
|
|
|
+ crawlerLoveFoot.setIssuer(pEle.text());
|
|
|
+ }
|
|
|
+ // 类别
|
|
|
+ Elements genresEles = container.select("div.info").select("p:contains(類別)");
|
|
|
+ if (genresEles.size() > 0) {
|
|
|
+ StringBuffer sb = new StringBuffer();
|
|
|
+ Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]");
|
|
|
+ for (Element ahrefEle : ahrefEles) {
|
|
|
+ sb.append(ahrefEle.text()).append(",");
|
|
|
+ }
|
|
|
+ if (sb.length() > 0) {
|
|
|
+ sb = sb.deleteCharAt(sb.length() - 1);
|
|
|
+ }
|
|
|
+ crawlerLoveFoot.setGenres(sb.toString());
|
|
|
+ }
|
|
|
+ // 演员
|
|
|
+ Elements castEles = container.select("div.info").select("p.star-show:contains(演員)");
|
|
|
+ if (castEles.size() > 0) {
|
|
|
+ Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)");
|
|
|
+ if (castElesTemp.size() == 0) {
|
|
|
+ StringBuffer sb = new StringBuffer();
|
|
|
+ Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]");
|
|
|
+ for (Element ahrefEle : ahrefEles) {
|
|
|
+ sb.append(ahrefEle.text()).append(",");
|
|
|
+ }
|
|
|
+ if (sb.length() > 0) {
|
|
|
+ sb = sb.deleteCharAt(sb.length() - 1);
|
|
|
+ }
|
|
|
+ crawlerLoveFoot.setCast(sb.toString());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 图片URL
|
|
|
+ String href = container.select("a.bigImage").first().attr("abs:href");
|
|
|
+
|
|
|
+ long start = System.currentTimeMillis();
|
|
|
+ Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
|
|
|
+
|
|
|
+ String fileName = issueDate.concat(" ").concat(h3);
|
|
|
+ byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
|
|
|
+ if (imgUrlBytes.length > 251) {
|
|
|
+ byte[] imgUrlDestBytes = new byte[251];
|
|
|
+ System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
|
|
|
+ fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
|
|
|
+ }
|
|
|
+ fileName = fileName.concat(".jpg");
|
|
|
+
|
|
|
+ String machiImgUrl = "足舐/".concat(fileName);
|
|
|
+
|
|
|
+ saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
|
|
|
+ long end = System.currentTimeMillis();
|
|
|
+
|
|
|
+ crawlerLoveFoot.setImgUrl(machiImgUrl);
|
|
|
+ crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
+
|
|
|
+ return end - start;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 保存文件到本地
|
|
|
+ *
|
|
|
+ * @param bufferedInputStream
|
|
|
+ * @param savePath
|
|
|
+ */
|
|
|
+ private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException {
|
|
|
+ //一次最多读取1k
|
|
|
+ byte[] buffer = new byte[1024];
|
|
|
+ //实际读取的长度
|
|
|
+ int readLenghth;
|
|
|
+ //创建的一个写出的缓冲流
|
|
|
+ BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath)));
|
|
|
+ //文件逐步写入本地
|
|
|
+ while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
|
|
|
+ bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
|
|
|
+ }
|
|
|
+ //关闭缓冲流
|
|
|
+ bufferedOutputStream.close();
|
|
|
+ bufferedInputStream.close();
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|