瀏覽代碼

add:福利吧图片详情页爬取定时任务v1

lvzhiqiang 1 年之前
父節點
當前提交
6a92896ea5

+ 19 - 0
src/main/java/top/lvzhiqiang/config/MyPicJobs.java

@@ -32,4 +32,23 @@ public class MyPicJobs {
         pictureInfoService.jsoupFulibaPic(null, null);
     }
 
+    /**
+     * Scheduled daily at 19:00 (SCHEDULED_ZONE): runs the fuliba detail-page crawl for log rows with status 2, the value the service sets when a crawl fails.
+     */
+    @Scheduled(cron = "0 0 19 * * ?", zone = SCHEDULED_ZONE)
+    public void jsoupFulibaPicDetailJob1() throws Exception {
+        log.warn("jsoupFulibaPicDetailJob1开始==============================");
+
+        pictureInfoService.jsoupFulibaPicDetail(2, null, null);
+    }
+
+    /**
+     * Scheduled daily at 20:00 (SCHEDULED_ZONE): runs the fuliba detail-page crawl for log rows with status 3, labelled "pending crawl" on the crawler.html form.
+     */
+    @Scheduled(cron = "0 0 20 * * ?", zone = SCHEDULED_ZONE)
+    public void jsoupFulibaPicDetailJob2() throws Exception {
+        log.warn("jsoupFulibaPicDetailJob2开始==============================");
+
+        pictureInfoService.jsoupFulibaPicDetail(3, null, null);
+    }
 }

+ 7 - 0
src/main/java/top/lvzhiqiang/controller/PictureInfoController.java

@@ -75,6 +75,13 @@ public class PictureInfoController {
         return R.ok();
     }
 
+    @RequestMapping("/jsoupFulibaPicDetail")
+    @ResponseBody
+    public R jsoupFulibaPicDetail(Integer status, String mainUrl, String id) throws Exception { // manual trigger for the fuliba detail-page crawl
+        pictureInfoService.jsoupFulibaPicDetail(status, mainUrl, id); // the implementation is annotated @Async, so this presumably returns before the crawl finishes — confirm async is enabled
+        return R.ok(); // NOTE(review): if the call above is async, this response does not reflect the crawl outcome
+    }
+
     public static void main(String[] args) {
         String s = "https://image.baidu.com/search/down?thumburl=https://baidu.com&url=https://tvax3.sinaimg.cn/large/006BNqYCly1ht929ixe8mj309s0dojsb.jpg";
         System.out.println(s.length());

+ 2 - 0
src/main/java/top/lvzhiqiang/entity/FileCrawlerImage.java

@@ -25,6 +25,8 @@ public class FileCrawlerImage extends FileImage {
      */
     private String orginUrl;
 
+    private Integer sort; // ordinal set by the detail crawler (starts at 1, incremented per stored image) while walking a page's img elements
+
     /**
      * 发布时间
      */

+ 20 - 2
src/main/java/top/lvzhiqiang/mapper/PictureInfoMapper.java

@@ -66,8 +66,8 @@ public interface PictureInfoMapper {
     @Select("SELECT * FROM file_crawler_image_log WHERE category_id = #{categoryId} and delete_flag = 1 order by publish_time desc limit 1")
     FileCrawlerImageLog findLatestCrawlerImage(Integer categoryId); // most recently published log row of the category, restricted to delete_flag = 1
 
-    @Insert("INSERT ignore INTO file_crawler_image(old_name, new_name, category_id, log_id, size, path, remark, orgin_url, publish_time, modify_time) " +
-            "VALUES (#{oldName}, #{newName}, #{categoryId}, #{logId}, #{size}, #{path}, #{remark}, #{orginUrl}, #{publishTime}, now())")
+    @Insert("INSERT ignore INTO file_crawler_image(old_name, new_name, category_id, log_id, size, path, remark, orgin_url, sort, modify_time) " +
+            "VALUES (#{oldName}, #{newName}, #{categoryId}, #{logId}, #{size}, #{path}, #{remark}, #{orginUrl}, #{sort}, now())")
     int insertIgnoreFileImage(FileImage fileImage); // returns the affected-row count; the detail crawler uploads the image only when this is > 0
 
     @Insert("INSERT INTO file_crawler_image_log(id, main_url, main_title, category_id, status, failure_cause, publish_time, create_time, modify_time) " +
@@ -90,4 +90,22 @@ public interface PictureInfoMapper {
             "</if>" +
             "</script>"})
     List<FileCrawlerImage> getCrawlerImageInfoList(Map<String, Object> params);
+
+    @Select({"<script>" +
+            "select * from file_crawler_image_log WHERE delete_flag = 1" +
+            "<if test=\"categoryId != null\">" +
+            "   and category_id = #{categoryId}" +
+            "</if>" +
+            "<if test=\"id != null and id != ''\">" +
+            "   and id = #{id}" +
+            "</if>" +
+            "<if test=\"mainUrl != null and mainUrl != ''\">" +
+            "   and main_url = #{mainUrl}" +
+            "</if>" +
+            "<if test=\"status != null\">" +
+            "   and status = #{status}" +
+            "</if>" +
+            " order by publish_time asc" +
+            "</script>"})
+    List<FileCrawlerImageLog> findJsoupFulibaPicDetailListByParams(Map<String, Object> params); // optional filters: categoryId, id, mainUrl, status (each applied only when present); oldest publish_time first
 }

+ 3 - 1
src/main/java/top/lvzhiqiang/service/PictureInfoService.java

@@ -28,5 +28,7 @@ public interface PictureInfoService {
 
     void jsoupFulibaPic(String startPageUrl, Boolean ignoreTimeCompare) throws Exception;
 
-    String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId);
+    void jsoupFulibaPicDetail(Integer status, String mainUrl, String id); // crawls detail pages for matching log rows; the implementation filters by id first, then mainUrl, then status
+
+    String jsoupFulibaPicDetailSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId); // crawls the images of one detail page; the current implementation returns "" on success
 }

+ 61 - 25
src/main/java/top/lvzhiqiang/service/impl/PictureInfoServiceImpl.java

@@ -30,10 +30,7 @@ import top.lvzhiqiang.util.*;
 import javax.annotation.Resource;
 import javax.imageio.ImageIO;
 import java.awt.image.BufferedImage;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.*;
 import java.math.BigDecimal;
 import java.math.RoundingMode;
 import java.net.Proxy;
@@ -58,7 +55,6 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
     private PictureInfoMapper pictureInfoMapper;
     @Value("${spring.profiles.active}")
     private String env;
-    private final String parentPath = LocalDate.now().format(DateUtils.dateFormatter5);
 
     @Override
     public Object getPictureInfoPage(Map<String, Object> params) {
@@ -331,23 +327,63 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
     }
 
     @Override
-    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
-    public String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId) {
-        /*try {
-            String mainTitle = SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicSub(mainUrl, headerMap, publishTime, crawlerImageLog.getId());
-            crawlerImageLog.setMainTitle(mainTitle);
-        } catch (Exception e) {
-            crawlerImageLog.setFailureCause(e.getMessage());
-            crawlerImageLog.setStatus(2);
-        } finally {
+    @Async // NOTE(review): with @Async the BusinessException thrown below presumably never reaches the HTTP caller or scheduler — confirm that is acceptable
+    public void jsoupFulibaPicDetail(Integer status, String mainUrl, String id) { // crawls the detail page of every matching category-1 log row and stores status 1 (success) or 2 (failure) per row
+        log.warn("jsoupFulibaPicDetail 开始:status={},mainUrl={},id={}", status, mainUrl, id);
+
+        StopWatch stopWatch = new StopWatch();
+        stopWatch.start();
+
+        Map<String, Object> params = new HashMap<>();
+        params.put("categoryId", 1);
+        if (StringUtils.isNotEmpty(id)) { // exactly one filter is applied, in priority order: id, mainUrl, status
+            params.put("id", id);
+        } else if (StringUtils.isNotEmpty(mainUrl)) {
+            params.put("mainUrl", mainUrl);
+        } else if (status != null) {
+            params.put("status", status);
+        } else {
+            throw new BusinessException(30000, "参数错误!");
+        }
+
+        List<FileCrawlerImageLog> fileCrawlerImageLogList = pictureInfoMapper.findJsoupFulibaPicDetailListByParams(params);
+        if (fileCrawlerImageLogList.isEmpty()) {
+            log.warn("jsoupFulibaPicDetail 结束:fileCrawlerImageLogList is empty");
+            return;
+        }
 
-        }*/
+        String crawlerFulibaUrl = InitRunner.dicCodeMap.get("crawler_fuliba_url").getCodeValue();
+        Map<String, String> headerMap = new HashMap<>();
+        headerMap.put("referer", crawlerFulibaUrl);
+        int successCount = 0;
+        int failCount = 0;
+        for (FileCrawlerImageLog fileCrawlerImageLog : fileCrawlerImageLogList) {
+            try {
+                Thread.sleep(5000L); // throttle between pages; NOTE(review): an InterruptedException here is caught below and recorded as a crawl failure without restoring the interrupt flag
+
+                SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicDetailSub(fileCrawlerImageLog.getMainUrl(), headerMap, fileCrawlerImageLog.getPublishTime(), fileCrawlerImageLog.getId()); // bean lookup instead of this.: presumably so the call goes through the Spring proxy and @Transactional applies — confirm
+                fileCrawlerImageLog.setStatus(1);
+                successCount++;
+            } catch (Exception e) {
+                fileCrawlerImageLog.setFailureCause(e.getMessage());
+                fileCrawlerImageLog.setStatus(2);
+                failCount++;
+            } finally {
+                pictureInfoMapper.insertOrUpdateFileCrawlerImageLog(fileCrawlerImageLog); // persist the outcome of this row whether it succeeded or failed
+                log.warn("jsoupFulibaPicDetail update status:mainUrl={},status={}", fileCrawlerImageLog.getMainUrl(), fileCrawlerImageLog.getStatus()); // log the row's own URL; the mainUrl parameter is null for id/status-driven runs
+            }
+        }
 
+        stopWatch.stop();
+        log.warn("jsoupFulibaPicDetail 结束:totalSize={},successCount={},failCount={},time={}", fileCrawlerImageLogList.size(), successCount, failCount, stopWatch.getTotalTimeMillis());
+    }
 
+    @Override
+    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
+    public String jsoupFulibaPicDetailSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId) {
         String newName;
         String imageUrl;
         String imageSize;
-        String mainTitle;
         Document fulibaDetailDocument;
         String ftpImageCrawlerBasePath = InitRunner.dicCodeMap.get("ftp_image_crawler_basepath").getCodeValue();
         String ftpThumbnailCrawlerBasePath = InitRunner.dicCodeMap.get("ftp_thumbnail_crawler_basepath").getCodeValue();
@@ -355,11 +391,11 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
         String srcUrl = "";
         try {
             fulibaDetailDocument = JsoupUtil.requestDocument(mainUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
-            log.warn("jsoupFulibaPicSub detail success:url={},logId={}", mainUrl, logId);
+            log.warn("jsoupFulibaPicDetailSub start:mainUrl={},logId={}", mainUrl, logId);
             Elements imgEles = fulibaDetailDocument.select("div.content > article.article-content").select("img");
-            mainTitle = fulibaDetailDocument.select("div.content > header.article-header > h1.article-title").select("a").text();
+            int i = 0;
+            String parentPath = "1" + File.separator + publishTime.format(DateUtils.dateFormatter5);
             for (Element imgEle : imgEles) {
-
                 srcUrl = imgEle.attr("src");
                 String altTitle = imgEle.attr("alt");
 
@@ -377,14 +413,14 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
                 imageSize = BigDecimal.valueOf(imageBytes.length).divide(new BigDecimal("1024")).setScale(0, RoundingMode.UP).toPlainString().concat("KB");
                 fileCrawlerImage.setSize(imageSize);
 
-                imageUrl = parentPath + "/" + newName;
+                imageUrl = parentPath + File.separator + newName;
                 fileCrawlerImage.setPath(imageUrl);
 
                 fileCrawlerImage.setRemark("");
                 fileCrawlerImage.setCategoryId(1L);
                 fileCrawlerImage.setOrginUrl(srcUrl);
                 fileCrawlerImage.setLogId(logId);
-                fileCrawlerImage.setPublishTime(publishTime);
+                fileCrawlerImage.setSort(++i);
                 int count = pictureInfoMapper.insertIgnoreFileImage(fileCrawlerImage);
                 if (count > 0) {
                     InputStream imageStream1 = new ByteArrayInputStream(imageBytes);
@@ -401,13 +437,13 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
             }
         } catch (Exception e) {
             // 异常,删除已经上传的文件
-            if (delPathList.size() > 0) {
-                delPathList.stream().forEach(delPath -> FtpUtil.delFile(delPath));
+            if (!delPathList.isEmpty()) {
+                delPathList.forEach(FtpUtil::delFile);
             }
 
-            log.error("jsoupFulibaPicSub exception,mainUrl={},publishTime={},srcUrl={}", mainUrl, publishTime, srcUrl, e);
+            log.error("jsoupFulibaPicDetailSub exception,mainUrl={},publishTime={},srcUrl={}", mainUrl, publishTime, srcUrl, e);
             throw new BusinessException(30000, e.getMessage());
         }
-        return mainTitle;
+        return "";
     }
 }

+ 16 - 0
src/main/resources/static/crawler.html

@@ -289,6 +289,22 @@
             <input type="submit" value="提交">
         </form>
     </div>
+    <div style="margin-right:20px;">
+        <span class="font">jsoupFulibaPicDetail</span>
+        <form method="post" action="pictureInfo/jsoupFulibaPicDetail"> <!-- the service filters by id first, then mainUrl, and falls back to status only when both are empty -->
+            <span>status</span>
+            <select name="status" style="height: 21.43px;">
+                <option value="2">失败</option>
+                <option value="3">待爬取</option>
+                <option value="1">成功</option>
+            </select>
+            <span>mainUrl</span>
+            <input type="text" name="mainUrl"/>
+            <span>id</span>
+            <input type="text" name="id"/>
+            <input type="submit" value="提交">
+        </form>
+    </div>
 </div>
 </body>
 </html>