|
|
@@ -68,7 +68,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
public void beforeProxy() {
|
|
|
if (null == proxy) {
|
|
|
if ("dev".equals(env)) {
|
|
|
- proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
|
|
|
+ proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897));
|
|
|
} else {
|
|
|
proxy = Proxy.NO_PROXY;
|
|
|
}
|
|
|
@@ -144,6 +144,40 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
|
|
|
@Async
|
|
|
@Override
|
|
|
+ public void jsoupLoveFoot4feetpassion(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
|
|
|
+ log.warn("jsoupLoveFoot4feetpassion 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
|
|
|
+ StopWatch stopWatch = new StopWatch();
|
|
|
+ stopWatch.start();
|
|
|
+ if (isDel == 1) {
|
|
|
+ crawlerLoveFootMapper.deleteAll();
|
|
|
+ }
|
|
|
+
|
|
|
+ List<DicCode> dicCodeList = dicCodeMapper.findAll();
|
|
|
+ // 获取常量MAP
|
|
|
+ footConstantMap = dicCodeList.stream()
|
|
|
+ .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ javbusConstantMap = dicCodeList.stream()
|
|
|
+ .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ javdbConstantMap = dicCodeList.stream()
|
|
|
+ .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ // 获取javbus防屏蔽地址
|
|
|
+ javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
|
|
|
+ if (javbusUrlList.size() == 0) {
|
|
|
+ log.warn("javbusUrlList为空");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ // 代理及TOKEN设置
|
|
|
+ beforeProxy();
|
|
|
+ // 解析原始站点
|
|
|
+ jsoupLoveFoot4feetpassionSub(status, ignoreRetryCount);
|
|
|
+ log.warn("jsoupLoveFoot4feetpassion 结束:time={}", stopWatch.getTotalTimeSeconds());
|
|
|
+ }
|
|
|
+
|
|
|
+ @Async
|
|
|
+ @Override
|
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website, String identificationCode) {
|
|
|
log.warn("jjsoupLoveFoot4CrawingFail 开始");
|
|
|
@@ -624,7 +658,10 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
|
|
|
String machiImgUrl = "足舐/".concat(fileName);
|
|
|
|
|
|
- saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
|
|
|
+ if (!"dev".equals(env)) {
|
|
|
+ saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
|
|
|
+ }
|
|
|
+
|
|
|
long end = System.currentTimeMillis();
|
|
|
|
|
|
crawlerLoveFoot.setImgUrl(machiImgUrl);
|
|
|
@@ -844,6 +881,127 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ public void jsoupLoveFoot4feetpassionSub(Integer status, Integer ignoreRetryCount) throws Exception {
|
|
|
+ CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4feetpassion();
|
|
|
+
|
|
|
+ LocalDate latestDate;
|
|
|
+ if (latestLoveFoot == null) {
|
|
|
+ latestDate = LocalDate.of(1970, 1, 1);
|
|
|
+ } else {
|
|
|
+ latestDate = latestLoveFoot.getUpdateDate();
|
|
|
+ }
|
|
|
+
|
|
|
+ String feetpassionUrl = footConstantMap.get("feetpassion_url");
|
|
|
+ headerMap.put("referer", feetpassionUrl);
|
|
|
+ Document loveFootDocument;
|
|
|
+ Document loveFootDetailDocument;
|
|
|
+ int successIndex = 0;
|
|
|
+ outer:
|
|
|
+ while (true) {
|
|
|
+ loveFootDocument = JsoupUtil.requestDocument(feetpassionUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
|
|
|
+ log.warn("jsoupLoveFoot4feetpassionSub page success:url={}", feetpassionUrl);
|
|
|
+
|
|
|
+ Elements sourceSelects = loveFootDocument.select("div.archive").select("article.archive__item");
|
|
|
+ for (Element sourceSelect : sourceSelects) {
|
|
|
+ Thread.sleep(1000L);
|
|
|
+ String sourceUrl = sourceSelect.select("div.eyecatch > a.eyecatch__link").attr("abs:href");
|
|
|
+
|
|
|
+ String sourceUrlOri = sourceUrl;
|
|
|
+ sourceUrl = URLDecoder.decode(sourceUrl, "UTF-8");
|
|
|
+
|
|
|
+ Integer statusInt = 2;
|
|
|
+ Integer typeInt = 1;
|
|
|
+ LocalDate clockDate = null;
|
|
|
+ LocalDate updateDate = null;
|
|
|
+ String keywords = null;
|
|
|
+ try {
|
|
|
+ loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrlOri, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
|
|
|
+ String clockDateStr = loveFootDetailDocument.select("div.dividerBottom > ul.dateList-main").select("li.icon-clock").text();
|
|
|
+ String updateDateStr = loveFootDetailDocument.select("div.dividerBottom > ul.dateList-main").select("li.icon-update").text();
|
|
|
+ if (StringUtils.isNotEmpty(clockDateStr)) {
|
|
|
+ clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
|
|
|
+ }
|
|
|
+ if (StringUtils.isNotEmpty(updateDateStr)) {
|
|
|
+ updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (clockDate == null && updateDate == null) {
|
|
|
+ break outer;
|
|
|
+ }
|
|
|
+ if (clockDate == null) {
|
|
|
+ clockDate = updateDate;
|
|
|
+ }
|
|
|
+ if (updateDate == null) {
|
|
|
+ updateDate = clockDate;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
|
|
|
+ break outer;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 获取关键词
|
|
|
+ Elements keywordsElements = loveFootDetailDocument.select("div.postContents").select("a > span");
|
|
|
+ if (!keywordsElements.isEmpty()) {
|
|
|
+ keywords = keywordsElements.get(0).text().trim();
|
|
|
+ } else {
|
|
|
+ keywordsElements = loveFootDetailDocument.select("div.postContents > section.content").select("p");
|
|
|
+ keywords = keywordsElements.get(0).text().trim();
|
|
|
+ keywords = keywords.split(":")[1].trim();
|
|
|
+ keywords = keywords.substring(0, keywords.lastIndexOf(" ")).trim();
|
|
|
+ }
|
|
|
+
|
|
|
+ if (StringUtils.isNotEmpty(keywords)) {
|
|
|
+ statusInt = 1;
|
|
|
+ successIndex++;
|
|
|
+ log.warn("jsoupLoveFoot4feetpassionSub parseDetailToKeywords success,sourceUrl={},keywords={},successNum={}", sourceUrl, keywords, successIndex);
|
|
|
+ } else {
|
|
|
+ throw new Exception("keywords is null");
|
|
|
+ }
|
|
|
+
|
|
|
+ // 通过关键词获取识别码
|
|
|
+ CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
|
|
|
+ crawlerLoveFoot.setClockDate(clockDate);
|
|
|
+ crawlerLoveFoot.setUpdateDate(updateDate);
|
|
|
+ crawlerLoveFoot.setOrginFeetpassionUrl(sourceUrl);
|
|
|
+ crawlerLoveFoot.setType(2);
|
|
|
+ crawlerLoveFoot.setStatus(3);
|
|
|
+ crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
+ crawlerLoveFoot.setName(keywords);
|
|
|
+
|
|
|
+ Thread.sleep(3000);
|
|
|
+ String message = parseKeywordsToCode(crawlerLoveFoot, keywords, "javdb");
|
|
|
+ if (StringUtils.isNotEmpty(message)) {
|
|
|
+ statusInt = 4;
|
|
|
+ throw new Exception(message);
|
|
|
+ }
|
|
|
+
|
|
|
+ crawlerLoveFootMapper.insertOrUpdate4feetpassion(crawlerLoveFoot);
|
|
|
+ } catch (Exception e) {
|
|
|
+ // log.error("jsoupLoveFoot4jpfootSub detail fail,sourceUrl={}", sourceUrl, e);
|
|
|
+ CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
|
|
|
+ crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
|
|
|
+ crawlerLoveFoot.setOrginFeetpassionUrl(sourceUrl);
|
|
|
+ crawlerLoveFoot.setClockDate(clockDate);
|
|
|
+ crawlerLoveFoot.setUpdateDate(updateDate);
|
|
|
+ crawlerLoveFoot.setName(keywords);
|
|
|
+ crawlerLoveFoot.setType(typeInt);
|
|
|
+ crawlerLoveFoot.setStatus(statusInt);
|
|
|
+ crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
+ crawlerLoveFoot.setFailureCause(e.getMessage());
|
|
|
+ crawlerLoveFootMapper.insertOrUpdate4feetpassion(crawlerLoveFoot);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 继续下一页
|
|
|
+ Elements nextSelects = loveFootDocument.select("ul.pager > li.pager__item-next").select("a");
|
|
|
+ if (nextSelects.size() > 0) {
|
|
|
+ feetpassionUrl = nextSelects.get(0).attr("abs:href");
|
|
|
+ } else {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords, String website) {
|
|
|
int retryCount = 0;
|
|
|
Document javbusSearchDocument = null;
|