|
|
@@ -236,71 +236,114 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
|
|
|
|
|
|
@Override
|
|
|
@Async
|
|
|
- public void jsoupFulibaPic(Object o, Object o1) throws Exception {
|
|
|
- log.warn("jsoupFulibaPic 开始:");
|
|
|
+ public void jsoupFulibaPic(String startPageUrl, Boolean ignoreTimeCompare) throws Exception {
|
|
|
+ log.warn("jsoupFulibaPic 开始:startPageUrl={},ignoreTimeCompare={}", startPageUrl, ignoreTimeCompare);
|
|
|
StopWatch stopWatch = new StopWatch();
|
|
|
stopWatch.start();
|
|
|
|
|
|
- FileCrawlerImage latestFileCrawlerImage = pictureInfoMapper.findLatestCrawlerImage(1);
|
|
|
+ FileCrawlerImageLog latestFileCrawlerImageLog = pictureInfoMapper.findLatestCrawlerImage(1);
|
|
|
LocalDate latestDate;
|
|
|
- if (latestFileCrawlerImage == null) {
|
|
|
+ if (latestFileCrawlerImageLog == null) {
|
|
|
latestDate = LocalDate.of(1970, 1, 1);
|
|
|
} else {
|
|
|
- latestDate = latestFileCrawlerImage.getPublishTime();
|
|
|
+ latestDate = latestFileCrawlerImageLog.getPublishTime();
|
|
|
}
|
|
|
|
|
|
String crawlerFulibaUrl = InitRunner.dicCodeMap.get("crawler_fuliba_url").getCodeValue();
|
|
|
+ if (StringUtils.isNotEmpty(startPageUrl)) {
|
|
|
+ crawlerFulibaUrl = startPageUrl;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (ignoreTimeCompare == null) {
|
|
|
+ ignoreTimeCompare = false;
|
|
|
+ }
|
|
|
+
|
|
|
Map<String, String> headerMap = new HashMap<>();
|
|
|
headerMap.put("referer", crawlerFulibaUrl);
|
|
|
- Document fulibaDocument;
|
|
|
+ Document fulibaDocument = null;
|
|
|
+ Elements sourceSelects = null;
|
|
|
+ int findCount = 0;
|
|
|
|
|
|
outer:
|
|
|
while (true) {
|
|
|
- fulibaDocument = JsoupUtil.requestDocument(crawlerFulibaUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
|
|
|
- log.warn("jsoupFulibaPic page success:url={}", crawlerFulibaUrl);
|
|
|
+ for (int i = 0; i < 10; i++) {
|
|
|
+ try {
|
|
|
+ fulibaDocument = JsoupUtil.requestDocument(crawlerFulibaUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
|
|
|
+ sourceSelects = fulibaDocument.select(".content").select("article.excerpt");
|
|
|
+ if (!sourceSelects.isEmpty()) {
|
|
|
+ log.warn("jsoupFulibaPic page success:i={},url={}", i, crawlerFulibaUrl);
|
|
|
+ break;
|
|
|
+ } else {
|
|
|
+ log.warn("jsoupFulibaPic page fail:i={},url={}", i, crawlerFulibaUrl);
|
|
|
+ }
|
|
|
+ } catch (Exception | Error e) {
|
|
|
+ log.warn("jsoupFulibaPic page fail:i={},url={}", i, crawlerFulibaUrl, e);
|
|
|
+ } finally {
|
|
|
+ Thread.sleep(5000L);
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- Elements sourceSelects = fulibaDocument.select(".content").select("article.excerpt");
|
|
|
+ if (sourceSelects == null || sourceSelects.isEmpty()) {
|
|
|
+ log.warn("jsoupFulibaPic page empty break:url={}", crawlerFulibaUrl);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ sourceSelects = fulibaDocument.select(".content").select("article.excerpt");
|
|
|
for (Element sourceSelect : sourceSelects) {
|
|
|
- Thread.sleep(2000L);
|
|
|
String mainUrl = sourceSelect.select("header").select("a").attr("abs:href");
|
|
|
mainUrl = URLDecoder.decode(mainUrl, "UTF-8");
|
|
|
|
|
|
+ String mainTitle = sourceSelect.select("header").select("a").attr("title");
|
|
|
+ mainTitle = mainTitle.replace("-福利吧", "");
|
|
|
+
|
|
|
String publishTimeStr = sourceSelect.select("div.meta").select("time").text();
|
|
|
LocalDate publishTime = LocalDate.parse(publishTimeStr, DateUtils.dateFormatter);
|
|
|
- if (publishTime.isBefore(latestDate) || publishTime.isEqual(latestDate)) {
|
|
|
+ if (!ignoreTimeCompare && (publishTime.isBefore(latestDate) || publishTime.isEqual(latestDate))) {
|
|
|
+ log.warn("jsoupFulibaPic page publishTime isbefore latestDate break:mainUrl={},mainTitle={},publishTime={},latestDate={}", mainUrl, mainTitle, publishTimeStr, latestDate.format(DateUtils.dateFormatter));
|
|
|
break outer;
|
|
|
}
|
|
|
|
|
|
FileCrawlerImageLog crawlerImageLog = new FileCrawlerImageLog();
|
|
|
crawlerImageLog.setId(UUIDUtils.getUUID());
|
|
|
crawlerImageLog.setMainUrl(mainUrl);
|
|
|
- crawlerImageLog.setStatus(1);
|
|
|
- try {
|
|
|
- String mainTitle = SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicSub(mainUrl, headerMap, publishTime, crawlerImageLog.getId());
|
|
|
- crawlerImageLog.setMainTitle(mainTitle);
|
|
|
- } catch (Exception e) {
|
|
|
- crawlerImageLog.setFailureCause(e.getMessage());
|
|
|
- crawlerImageLog.setStatus(2);
|
|
|
- } finally {
|
|
|
- pictureInfoMapper.insertOrUpdateFileCrawlerImageLog(crawlerImageLog);
|
|
|
- }
|
|
|
+ crawlerImageLog.setMainTitle(mainTitle);
|
|
|
+ crawlerImageLog.setCategoryId(1L);
|
|
|
+ crawlerImageLog.setStatus(3);
|
|
|
+ crawlerImageLog.setPublishTime(publishTime);
|
|
|
+ int count = pictureInfoMapper.insertIgnoreFileCrawlerImageLog(crawlerImageLog);
|
|
|
+ findCount += count;
|
|
|
}
|
|
|
|
|
|
// 继续下一页
|
|
|
- Elements nextSelects = fulibaDocument.select("div.pagination > ul").select("li.next-page");
|
|
|
+ Elements nextSelects = fulibaDocument.select("div.pagination > ul").select("li.next-page").select("a");
|
|
|
if (!nextSelects.isEmpty()) {
|
|
|
- crawlerFulibaUrl = nextSelects.get(0).select("a").attr("abs:href");
|
|
|
+ crawlerFulibaUrl = nextSelects.get(0).attr("abs:href");
|
|
|
+ if (StringUtils.isEmpty(crawlerFulibaUrl)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
} else {
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- log.warn("jsoupFulibaPic 结束:time={}", stopWatch.getTotalTimeSeconds());
|
|
|
+ stopWatch.stop();
|
|
|
+ log.warn("jsoupFulibaPic 结束:findCount={},time={}", findCount, stopWatch.getTotalTimeMillis());
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
public String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId) {
|
|
|
+ /*try {
|
|
|
+ String mainTitle = SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicSub(mainUrl, headerMap, publishTime, crawlerImageLog.getId());
|
|
|
+ crawlerImageLog.setMainTitle(mainTitle);
|
|
|
+ } catch (Exception e) {
|
|
|
+ crawlerImageLog.setFailureCause(e.getMessage());
|
|
|
+ crawlerImageLog.setStatus(2);
|
|
|
+ } finally {
|
|
|
+
|
|
|
+ }*/
|
|
|
+
|
|
|
+
|
|
|
String newName;
|
|
|
String imageUrl;
|
|
|
String imageSize;
|