Browse Source

add:福利吧图片爬取定时任务优化v2

lvzhiqiang 1 year ago
parent
commit
2ca7034199

+ 1 - 1
src/main/java/top/lvzhiqiang/config/MyPicJobs.java

@@ -25,7 +25,7 @@ public class MyPicJobs {
     /**
      * jsoup fuliba
      */
-    @Scheduled(cron = "0 0 3 * * ?", zone = SCHEDULED_ZONE)
+    @Scheduled(cron = "0 0 18 * * ?", zone = SCHEDULED_ZONE)
     public void jsoupFulibaPicJob() throws Exception {
         log.warn("jsoupFulibaPicJob开始==============================");
 

+ 2 - 2
src/main/java/top/lvzhiqiang/controller/PictureInfoController.java

@@ -70,8 +70,8 @@ public class PictureInfoController {
 
     @RequestMapping("/jsoupFulibaPic")
     @ResponseBody
-    public R jsoupFulibaPic() throws Exception {
-        pictureInfoService.jsoupFulibaPic(null, null);
+    public R jsoupFulibaPic(String startPageUrl, Boolean ignoreTimeCompare) throws Exception {
+        pictureInfoService.jsoupFulibaPic(startPageUrl, ignoreTimeCompare); // both request parameters are forwarded as-is; no validation happens here
         return R.ok();
     }
 

+ 9 - 0
src/main/java/top/lvzhiqiang/entity/FileCrawlerImageLog.java

@@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonFormat;
 import lombok.Data;
 
 import java.io.Serializable;
+import java.time.LocalDate;
 import java.time.LocalDateTime;
 
 /**
@@ -30,6 +31,8 @@ public class FileCrawlerImageLog implements Serializable {
      */
     private String mainTitle;
 
+    private Long categoryId;
+
     /**
      * 状态(1:成功,2:失败)
      */
@@ -41,6 +44,12 @@ public class FileCrawlerImageLog implements Serializable {
     private String failureCause;
 
     /**
+     * 发布时间
+     */
+    @JsonFormat(pattern = "yyyy-MM-dd", timezone = "GMT+8")
+    private LocalDate publishTime;
+
+    /**
      * 创建时间
      */
     @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")

+ 8 - 4
src/main/java/top/lvzhiqiang/mapper/PictureInfoMapper.java

@@ -63,18 +63,22 @@ public interface PictureInfoMapper {
     @Select("delete from file_image where id = #{id}")
     FileImage deleteFileImageById(Long id);
 
-    @Select("SELECT * FROM file_crawler_image WHERE category_id = #{categoryId} and delete_flag = 1 order by publish_time desc limit 1")
-    FileCrawlerImage findLatestCrawlerImage(Integer categoryId);
+    @Select("SELECT * FROM file_crawler_image_log WHERE category_id = #{categoryId} and delete_flag = 1 order by publish_time desc limit 1")
+    FileCrawlerImageLog findLatestCrawlerImage(Integer categoryId);
 
     @Insert("INSERT ignore INTO file_crawler_image(old_name, new_name, category_id, log_id, size, path, remark, orgin_url, publish_time, modify_time) " +
             "VALUES (#{oldName}, #{newName}, #{categoryId}, #{logId}, #{size}, #{path}, #{remark}, #{orginUrl}, #{publishTime}, now())")
     int insertIgnoreFileImage(FileImage fileImage);
 
-    @Insert("INSERT INTO file_crawler_image_log(id, main_url, main_title, status, failure_cause, create_time, modify_time) " +
-            "VALUES (#{id}, #{mainUrl}, #{mainTitle}, #{status}, #{failureCause}, now(), now()) " +
+    @Insert("INSERT INTO file_crawler_image_log(id, main_url, main_title, category_id, status, failure_cause, publish_time, create_time, modify_time) " +
+            "VALUES (#{id}, #{mainUrl}, #{mainTitle}, #{categoryId}, #{status}, #{failureCause}, #{publishTime}, now(), now()) " +
             "ON DUPLICATE KEY UPDATE status=values(status),failure_cause=values(failure_cause),modify_time=now()")
     void insertOrUpdateFileCrawlerImageLog(FileCrawlerImageLog crawlerImageLog);
 
+    @Insert("INSERT ignore INTO file_crawler_image_log(id, main_url, main_title, category_id, status, failure_cause, publish_time, create_time, modify_time) " +
+            "VALUES (#{id}, #{mainUrl}, #{mainTitle}, #{categoryId}, #{status}, #{failureCause}, #{publishTime}, now(), now())")
+    int insertIgnoreFileCrawlerImageLog(FileCrawlerImageLog crawlerImageLog); // NOTE(review): the crawler sets a fresh UUID as id, so IGNORE only skips duplicates if the table has another unique key (e.g. on main_url) - not visible here, TODO confirm
+
     @Select({"<script>" +
             "select fi.*,fcic.category_name from file_crawler_image fi left join file_crawler_image_category fcic on fi.category_id = fcic.id " +
             "left join file_crawler_image_log fcil on fi.log_id = fcil.id WHERE 1 = 1" +

+ 1 - 1
src/main/java/top/lvzhiqiang/service/PictureInfoService.java

@@ -26,7 +26,7 @@ public interface PictureInfoService {
 
     R deleteImgs(Long imageId);
 
-    void jsoupFulibaPic(Object o, Object o1) throws Exception;
+    void jsoupFulibaPic(String startPageUrl, Boolean ignoreTimeCompare) throws Exception;
 
     String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId);
 }

+ 67 - 24
src/main/java/top/lvzhiqiang/service/impl/PictureInfoServiceImpl.java

@@ -236,71 +236,114 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
 
     @Override
     @Async
-    public void jsoupFulibaPic(Object o, Object o1) throws Exception {
-        log.warn("jsoupFulibaPic 开始:");
+    public void jsoupFulibaPic(String startPageUrl, Boolean ignoreTimeCompare) throws Exception {
+        // Walks the fuliba list pages (from startPageUrl when given, else the configured crawler_fuliba_url) and inserts one status-3 log row per article.
+        log.warn("jsoupFulibaPic 开始:startPageUrl={},ignoreTimeCompare={}", startPageUrl, ignoreTimeCompare);
         StopWatch stopWatch = new StopWatch();
         stopWatch.start();
 
-        FileCrawlerImage latestFileCrawlerImage = pictureInfoMapper.findLatestCrawlerImage(1);
+        FileCrawlerImageLog latestFileCrawlerImageLog = pictureInfoMapper.findLatestCrawlerImage(1);
         LocalDate latestDate;
-        if (latestFileCrawlerImage == null) {
+        if (latestFileCrawlerImageLog == null || latestFileCrawlerImageLog.getPublishTime() == null) { // no usable date yet: fall back to the epoch so nothing is skipped
             latestDate = LocalDate.of(1970, 1, 1);
         } else {
-            latestDate = latestFileCrawlerImage.getPublishTime();
+            latestDate = latestFileCrawlerImageLog.getPublishTime();
         }
 
         String crawlerFulibaUrl = InitRunner.dicCodeMap.get("crawler_fuliba_url").getCodeValue();
+        if (StringUtils.isNotEmpty(startPageUrl)) {
+            crawlerFulibaUrl = startPageUrl; // a caller-supplied start page replaces the configured URL
+        }
+
+        boolean skipDateCheck = Boolean.TRUE.equals(ignoreTimeCompare); // null is treated as false
+
         Map<String, String> headerMap = new HashMap<>();
         headerMap.put("referer", crawlerFulibaUrl);
-        Document fulibaDocument;
+        Document fulibaDocument = null;
+        Elements sourceSelects = null;
+        int findCount = 0; // sum of the ints returned by insertIgnoreFileCrawlerImageLog
 
         outer:
         while (true) {
-            fulibaDocument = JsoupUtil.requestDocument(crawlerFulibaUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
-            log.warn("jsoupFulibaPic page success:url={}", crawlerFulibaUrl);
+            // Reset per page: if every retry below throws, the previous page's elements must not pass the emptiness check (that would re-crawl the same page forever).
+            sourceSelects = null;
+            for (int i = 0; i < 10; i++) { // up to 10 attempts per page
+                try {
+                    fulibaDocument = JsoupUtil.requestDocument(crawlerFulibaUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
+                    sourceSelects = fulibaDocument.select(".content").select("article.excerpt");
+                    if (!sourceSelects.isEmpty()) {
+                        log.warn("jsoupFulibaPic page success:i={},url={}", i, crawlerFulibaUrl);
+                        break;
+                    } else {
+                        log.warn("jsoupFulibaPic page fail:i={},url={}", i, crawlerFulibaUrl);
+                    }
+                } catch (Exception | Error e) {
+                    log.warn("jsoupFulibaPic page fail:i={},url={}", i, crawlerFulibaUrl, e);
+                } finally {
+                    Thread.sleep(5000L); // runs after every attempt, including the successful one
+                }
+            }
 
-            Elements sourceSelects = fulibaDocument.select(".content").select("article.excerpt");
+            if (sourceSelects == null || sourceSelects.isEmpty()) {
+                log.warn("jsoupFulibaPic page empty break:url={}", crawlerFulibaUrl);
+                break;
+            }
+
             for (Element sourceSelect : sourceSelects) {
-                Thread.sleep(2000L);
                 String mainUrl = sourceSelect.select("header").select("a").attr("abs:href");
                 mainUrl = URLDecoder.decode(mainUrl, "UTF-8");
 
+                String mainTitle = sourceSelect.select("header").select("a").attr("title");
+                mainTitle = mainTitle.replace("-福利吧", "");
+
                 String publishTimeStr = sourceSelect.select("div.meta").select("time").text();
                 LocalDate publishTime = LocalDate.parse(publishTimeStr, DateUtils.dateFormatter);
-                if (publishTime.isBefore(latestDate) || publishTime.isEqual(latestDate)) {
+                if (!skipDateCheck && (publishTime.isBefore(latestDate) || publishTime.isEqual(latestDate))) { // first article not newer than latestDate ends the whole crawl
+                    log.warn("jsoupFulibaPic page publishTime isbefore latestDate break:mainUrl={},mainTitle={},publishTime={},latestDate={}", mainUrl, mainTitle, publishTimeStr, latestDate.format(DateUtils.dateFormatter));
                     break outer;
                 }
 
                 FileCrawlerImageLog crawlerImageLog = new FileCrawlerImageLog();
                 crawlerImageLog.setId(UUIDUtils.getUUID());
                 crawlerImageLog.setMainUrl(mainUrl);
-                crawlerImageLog.setStatus(1);
-                try {
-                    String mainTitle = SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicSub(mainUrl, headerMap, publishTime, crawlerImageLog.getId());
-                    crawlerImageLog.setMainTitle(mainTitle);
-                } catch (Exception e) {
-                    crawlerImageLog.setFailureCause(e.getMessage());
-                    crawlerImageLog.setStatus(2);
-                } finally {
-                    pictureInfoMapper.insertOrUpdateFileCrawlerImageLog(crawlerImageLog);
-                }
+                crawlerImageLog.setMainTitle(mainTitle);
+                crawlerImageLog.setCategoryId(1L);
+                crawlerImageLog.setStatus(3);
+                crawlerImageLog.setPublishTime(publishTime);
+                int count = pictureInfoMapper.insertIgnoreFileCrawlerImageLog(crawlerImageLog);
+                findCount += count;
             }
 
             // continue with the next page
-            Elements nextSelects = fulibaDocument.select("div.pagination > ul").select("li.next-page");
+            Elements nextSelects = fulibaDocument.select("div.pagination > ul").select("li.next-page").select("a");
             if (!nextSelects.isEmpty()) {
-                crawlerFulibaUrl = nextSelects.get(0).select("a").attr("abs:href");
+                crawlerFulibaUrl = nextSelects.get(0).attr("abs:href");
+                if (StringUtils.isEmpty(crawlerFulibaUrl)) {
+                    break;
+                }
             } else {
                 break;
             }
         }
 
-        log.warn("jsoupFulibaPic 结束:time={}", stopWatch.getTotalTimeSeconds());
+        stopWatch.stop();
+        log.warn("jsoupFulibaPic 结束:findCount={},time={}", findCount, stopWatch.getTotalTimeMillis());
     }
 
     @Override
     @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
     public String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId) {
+        /*try {
+            String mainTitle = SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicSub(mainUrl, headerMap, publishTime, crawlerImageLog.getId());
+            crawlerImageLog.setMainTitle(mainTitle);
+        } catch (Exception e) {
+            crawlerImageLog.setFailureCause(e.getMessage());
+            crawlerImageLog.setStatus(2);
+        } finally {
+
+        }*/
+
+
         String newName;
         String imageUrl;
         String imageSize;

+ 3 - 2
src/main/java/top/lvzhiqiang/util/JsoupUtil.java

@@ -111,8 +111,9 @@ public class JsoupUtil {
                 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
                 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
                 "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0",
-                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"};
-        int i = r.nextInt(15);
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"};
+        int i = r.nextInt(16);
         return ua[i];
     }
 

+ 7 - 0
src/main/resources/static/crawler.html

@@ -279,6 +279,13 @@
     <div style="margin-right:20px;">
         <span class="font">jsoupFulibaPic</span>
         <form method="post" action="pictureInfo/jsoupFulibaPic">
+            <span>startPageUrl</span>
+            <input type="text" name="startPageUrl" placeholder="https://fuliba2024.net/flhz"/>
+            <span>ignoreTimeCompare</span>
+            <select name="ignoreTimeCompare" style="height: 21.43px;">
+                <option value="false">否</option>
+                <option value="true">是</option>
+            </select>
             <input type="submit" value="提交">
         </form>
     </div>