@@ -109,6 +109,38 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
     @Async
     @Override
     @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
+    public void jsoupLoveFoot4jpfoot(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
+        log.warn("jsoupLoveFoot4jpfoot start:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
+        StopWatch stopWatch = new StopWatch();
+        stopWatch.start();
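+        // isDel == 1: clear all previously crawled rows before starting a fresh crawl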
+        if (isDel == 1) {
+            crawlerLoveFootMapper.deleteAll();
+        }
+
+        List<DicCode> dicCodeList = dicCodeMapper.findAll();
+        // Build the constant maps
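+        // The toMap merge function (key1, key2) -> key1 keeps the first value when a codeKey appears more than once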
+        footConstantMap = dicCodeList.stream()
+                .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
+                .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
+        javbusConstantMap = dicCodeList.stream()
+                .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
+                .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
+        // Fetch the javbus anti-blocking (mirror) URLs
+        javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
+        if (javbusUrlList.size() == 0) {
+            log.warn("javbusUrlList is empty");
+            return;
+        }
+        // Proxy and token setup
+        beforeProxy();
+        // Parse the source site
+        jsoupLoveFoot4jpfootSub(status, ignoreRetryCount);
+        // Stop the watch so getTotalTimeSeconds() reports the elapsed time
+        stopWatch.stop();
+        log.warn("jsoupLoveFoot4jpfoot end:time={}", stopWatch.getTotalTimeSeconds());
+    }
+
+    @Async
+    @Override
+    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
     public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website) {
         log.warn("jsoupLoveFoot4CrawingFail start");
         StopWatch stopWatch = new StopWatch();
@@ -448,7 +480,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
     @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
     public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
-        CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo();
+        CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4avnoashi();

         LocalDate latestDate;
         if (latestLoveFoot == null) {
@@ -500,7 +532,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
                     CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
                     crawlerLoveFoot.setClockDate(clockDate);
                     crawlerLoveFoot.setUpdateDate(updateDate);
-                    crawlerLoveFoot.setOrginUrl(sourceUrl);
+                    crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
                     crawlerLoveFoot.setType(2);
                     crawlerLoveFoot.setStatus(3);
                     crawlerLoveFoot.setCreateTime(LocalDateTime.now());
@@ -510,12 +542,12 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
                         throw new Exception(message);
                     }

-                    crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
+                    crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
                 } catch (Exception e) {
                     log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
                     CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
                     crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
-                    crawlerLoveFoot.setOrginUrl(sourceUrl);
+                    crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
                     crawlerLoveFoot.setClockDate(clockDate);
                     crawlerLoveFoot.setUpdateDate(updateDate);
                     crawlerLoveFoot.setName(keywords);
@@ -523,7 +555,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
                     crawlerLoveFoot.setStatus(statusInt);
                     crawlerLoveFoot.setCreateTime(LocalDateTime.now());
                     crawlerLoveFoot.setFailureCause(e.getMessage());
-                    crawlerLoveFootMapper.insertOrUpdate(crawlerLoveFoot);
+                    crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
                 }
             }
@@ -537,6 +569,96 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
         }
     }

+    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
+    public void jsoupLoveFoot4jpfootSub(Integer status, Integer ignoreRetryCount) throws Exception {
+        CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4jpfoot();
+
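+        // With no jpfoot record stored yet, fall back to the epoch so every entry counts as new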
+        LocalDate latestDate;
+        if (latestLoveFoot == null) {
+            latestDate = LocalDate.of(1970, 1, 1);
+        } else {
+            latestDate = latestLoveFoot.getUpdateDate();
+        }
+
+        String jpfootUrl = footConstantMap.get("jpfoot_url");
+        headerMap.put("referer", jpfootUrl);
+        Document loveFootDocument;
+        Document loveFootDetailDocument;
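+        // "outer" labels the pagination loop so the date check inside the detail loop can stop the whole crawl at once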
+        outer:
+        while (true) {
+            loveFootDocument = JsoupUtil.requestDocument(jpfootUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
+            log.warn("jsoupLoveFoot4jpfootSub page success:url={}", jpfootUrl);
+
+            Elements sourceSelects = loveFootDocument.select("article.mainContainer > div.av_itemGrid").select("article.av_item");
+            for (Element sourceSelect : sourceSelects) {
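+                // Crude throttle: pause one second between detail-page requests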
+                Thread.sleep(1000L);
+                String sourceUrl = sourceSelect.select("a.av_itemLink").attr("abs:href");
+
+                Integer statusInt = 2;
+                Integer typeInt = 1;
+                LocalDate clockDate = null;
+                LocalDate updateDate = null;
+                String keywords = null;
+                try {
+                    loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
+                    String dateStr = loveFootDetailDocument.select("div.avdetail_date").select("span.avdetail_dateText").text();
+                    clockDate = LocalDate.parse(dateStr, DateUtils.dateFormatter4);
+                    updateDate = clockDate;
+
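+                    // Entries at or before the latest stored update date were crawled previously, so stop here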
+                    if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
+                        break outer;
+                    }
+
+                    // Extract the keywords
+                    keywords = loveFootDetailDocument.select("div.avdetail_detailTop").select("p.avdetail_detailTopTitle").text().trim();
+                    if (StringUtils.isNotEmpty(keywords)) {
+                        statusInt = 1;
+                        log.warn("jsoupLoveFoot4jpfootSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
+                    } else {
+                        throw new Exception("keywords is null");
+                    }
+
+                    // Look up the identification code from the keywords
+                    CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
+                    crawlerLoveFoot.setClockDate(clockDate);
+                    crawlerLoveFoot.setUpdateDate(updateDate);
+                    crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
+                    crawlerLoveFoot.setType(2);
+                    crawlerLoveFoot.setStatus(3);
+                    crawlerLoveFoot.setCreateTime(LocalDateTime.now());
+                    String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
+                    if (StringUtils.isNotEmpty(message)) {
+                        statusInt = 4;
+                        throw new Exception(message);
+                    }
+
+                    crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
+                } catch (Exception e) {
+                    log.error("jsoupLoveFoot4jpfootSub detail fail,sourceUrl={}", sourceUrl, e);
+                    CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
+                    crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
+                    crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
+                    crawlerLoveFoot.setClockDate(clockDate);
+                    crawlerLoveFoot.setUpdateDate(updateDate);
+                    crawlerLoveFoot.setName(keywords);
+                    crawlerLoveFoot.setType(typeInt);
+                    crawlerLoveFoot.setStatus(statusInt);
+                    crawlerLoveFoot.setCreateTime(LocalDateTime.now());
+                    crawlerLoveFoot.setFailureCause(e.getMessage());
+                    crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
+                }
+            }
+
+            // Move on to the next page
+            Elements nextSelects = loveFootDocument.select("nav.pagination > div.nav-links").select("a.next");
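+            // attr("abs:href") resolves the next-page link to an absolute URL against the page's base URI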
+            if (nextSelects.size() > 0) {
+                jpfootUrl = nextSelects.get(0).attr("abs:href");
+            } else {
+                break;
+            }
+        }
+    }
+
     private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
         int retryCount = 0;
         Document javbusSearchDocument = null;