Kaynağa Gözat

add:福利吧图片爬取定时任务v1

lvzhiqiang 1 yıl önce
ebeveyn
işleme
7d0ccc03cd

+ 35 - 0
src/main/java/top/lvzhiqiang/config/MyPicJobs.java

@@ -0,0 +1,35 @@
+package top.lvzhiqiang.config;
+
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+import top.lvzhiqiang.service.PictureInfoService;
+
+import javax.annotation.Resource;
+
+/**
+ * Scheduled jobs for picture crawling.
+ *
+ * @author lvzhiqiang
+ * 2024/9/4 16:08
+ */
+@Slf4j
+@Component
+public class MyPicJobs {
+
+    /** Time zone in which the cron expressions of this class are evaluated. */
+    private static final String SCHEDULED_ZONE = "Asia/Shanghai";
+
+    @Resource
+    private PictureInfoService pictureInfoService;
+
+    /**
+     * Fires every day at 03:00 (in {@link #SCHEDULED_ZONE}) and starts the fuliba picture crawl
+     * by calling {@link PictureInfoService#jsoupFulibaPic(Object, Object)} with two null arguments.
+     *
+     * @throws Exception whatever the service call propagates
+     */
+    @Scheduled(zone = SCHEDULED_ZONE, cron = "0 0 3 * * ?")
+    public void jsoupFulibaPicJob() throws Exception {
+        log.warn("jsoupFulibaPicJob开始==============================");
+        pictureInfoService.jsoupFulibaPic(null, null);
+    }
+}

+ 12 - 0
src/main/java/top/lvzhiqiang/controller/PictureInfoController.java

@@ -69,4 +69,16 @@ public class PictureInfoController {
 
         return pictureInfoService.deleteImgs(imageId);
     }
+
+    /**
+     * HTTP entry point that starts the fuliba picture crawl on demand.
+     *
+     * @return {@code R.ok()} once the service call has returned
+     * @throws Exception whatever the service call propagates
+     */
+    @ResponseBody
+    @RequestMapping("/jsoupFulibaPic")
+    public R jsoupFulibaPic() throws Exception {
+        // Both service arguments are intentionally passed as null.
+        pictureInfoService.jsoupFulibaPic(null, null);
+
+        return R.ok();
+    }
+
+    /**
+     * Ad-hoc check that prints the character count of a sample proxied image URL.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        final String sampleUrl = "https://image.baidu.com/search/down?thumburl=https://baidu.com&url=https://tvax3.sinaimg.cn/large/006BNqYCly1ht929ixe8mj309s0dojsb.jpg";
+        final int charCount = sampleUrl.length();
+        System.out.println(charCount);
+    }
 }

+ 33 - 0
src/main/java/top/lvzhiqiang/entity/FileCrawlerImage.java

@@ -0,0 +1,33 @@
+package top.lvzhiqiang.entity;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import lombok.Data;
+
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+
+/**
+ * Entity for the "file / crawler / image" table: a {@link FileImage} extended with
+ * crawler-specific fields.
+ *
+ * @author lvzhiqiang
+ * 2024/9/4 16:08
+ */
+@Data
+public class FileCrawlerImage extends FileImage {
+
+    /**
+     * Log id (the crawler service stores the id of the FileCrawlerImageLog entry here).
+     */
+    private String logId;
+
+    /**
+     * Original URL the image was taken from.
+     * NOTE(review): the spelling "orgin" is also used by the mapper SQL ({@code #{orginUrl}} /
+     * {@code orgin_url}), so it cannot be corrected in this class alone.
+     */
+    private String orginUrl;
+
+    /**
+     * Publish time (a date without time-of-day; serialized by Jackson as yyyy-MM-dd).
+     */
+    @JsonFormat(pattern = "yyyy-MM-dd", timezone = "GMT+8")
+    private LocalDate publishTime;
+}

+ 54 - 0
src/main/java/top/lvzhiqiang/entity/FileCrawlerImageLog.java

@@ -0,0 +1,54 @@
+package top.lvzhiqiang.entity;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import lombok.Data;
+
+import java.io.Serializable;
+import java.time.LocalDateTime;
+
+/**
+ * Entity for the log table of the "file / crawler / image" table.
+ *
+ * @author lvzhiqiang
+ * 2024/9/6 10:59
+ */
+@Data
+public class FileCrawlerImageLog implements Serializable {
+
+    /**
+     * Primary key.
+     */
+    private String id;
+
+    /**
+     * Main URL.
+     */
+    private String mainUrl;
+
+    /**
+     * Main title.
+     */
+    private String mainTitle;
+
+    /**
+     * Status (1: success, 2: failure).
+     */
+    private Integer status;
+
+    /**
+     * Failure cause.
+     */
+    private String failureCause;
+
+    /**
+     * Creation time (serialized by Jackson as yyyy-MM-dd HH:mm:ss).
+     */
+    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
+    private LocalDateTime createTime;
+
+    /**
+     * Last modification time (serialized by Jackson as yyyy-MM-dd HH:mm:ss).
+     */
+    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
+    private LocalDateTime modifyTime;
+}

+ 12 - 3
src/main/java/top/lvzhiqiang/mapper/PictureInfoMapper.java

@@ -3,9 +3,7 @@ package top.lvzhiqiang.mapper;
 import org.apache.ibatis.annotations.Insert;
 import org.apache.ibatis.annotations.Select;
 import org.apache.ibatis.annotations.Update;
-import top.lvzhiqiang.entity.FileImage;
-import top.lvzhiqiang.entity.VideoCast;
-import top.lvzhiqiang.entity.VideoGenres;
+import top.lvzhiqiang.entity.*;
 
 import java.util.List;
 import java.util.Map;
@@ -65,4 +63,15 @@ public interface PictureInfoMapper {
     @Select("delete from file_image where id = #{id}")
     FileImage deleteFileImageById(Long id);
 
+    /**
+     * Selects the {@code file_crawler_image} row with the newest {@code publish_time} for one
+     * category, looking only at rows whose {@code delete_flag} is 1 (the query is limited to
+     * a single row).
+     *
+     * @param categoryId value matched against the {@code category_id} column
+     * @return the newest matching row; presumably {@code null} when nothing matches (the
+     *         caller in the service checks for null) - TODO confirm
+     */
+    @Select("SELECT * FROM file_crawler_image WHERE category_id = #{categoryId} and delete_flag = 1 order by publish_time desc limit 1")
+    FileCrawlerImage findLatestCrawlerImage(Integer categoryId);
+
+    /**
+     * Inserts one crawled image into {@code file_crawler_image} using {@code INSERT ignore};
+     * {@code modify_time} is set to {@code now()} by the statement.
+     *
+     * <p>The parameter is declared as {@link FileCrawlerImage} because the statement binds
+     * {@code logId}, {@code orginUrl} and {@code publishTime}, which only that subclass declares.
+     *
+     * @param fileImage the crawled image to store
+     * @return the affected-row count reported for the statement; the service only uploads
+     *         the file when this is greater than 0
+     */
+    @Insert("INSERT ignore INTO file_crawler_image(old_name, new_name, category_id, log_id, size, path, remark, orgin_url, publish_time, modify_time) " +
+            "VALUES (#{oldName}, #{newName}, #{categoryId}, #{logId}, #{size}, #{path}, #{remark}, #{orginUrl}, #{publishTime}, now())")
+    int insertIgnoreFileImage(FileCrawlerImage fileImage);
+
+    /**
+     * Inserts a crawler log row into {@code file_crawler_image_log}; when the key already
+     * exists, only {@code status}, {@code failure_cause} and {@code modify_time} are updated.
+     * {@code create_time} and {@code modify_time} are filled with {@code now()} by the statement.
+     *
+     * @param crawlerImageLog the log entry to write
+     */
+    @Insert("INSERT INTO file_crawler_image_log(id, main_url, main_title, status, failure_cause, create_time, modify_time) " +
+            "VALUES (#{id}, #{mainUrl}, #{mainTitle}, #{status}, #{failureCause}, now(), now()) " +
+            "ON DUPLICATE KEY UPDATE status=values(status),failure_cause=values(failure_cause),modify_time=now()")
+    void insertOrUpdateFileCrawlerImageLog(FileCrawlerImageLog crawlerImageLog);
 }

+ 5 - 0
src/main/java/top/lvzhiqiang/service/PictureInfoService.java

@@ -5,6 +5,7 @@ import org.springframework.web.multipart.MultipartFile;
 import top.lvzhiqiang.dto.R;
 import top.lvzhiqiang.entity.FileImage;
 
+import java.time.LocalDate;
 import java.util.Map;
 
 /**
@@ -24,4 +25,8 @@ public interface PictureInfoService {
     R insertOrUpdateImg(MultipartFile file, String remark, Long categoryId, String id);
 
     R deleteImgs(Long imageId);
+
+    /**
+     * Runs the fuliba picture crawl.
+     *
+     * @param o  not read by the implementation in PictureInfoServiceImpl; callers pass null
+     * @param o1 not read by the implementation in PictureInfoServiceImpl; callers pass null
+     * @throws Exception if the crawl fails
+     */
+    void jsoupFulibaPic(Object o, Object o1) throws Exception;
+
+    /**
+     * Processes a single fuliba post: fetches the page at {@code mainUrl} and stores its images.
+     *
+     * @param mainUrl     URL of the post page
+     * @param headerMap   HTTP headers sent with the page request
+     * @param publishTime publish date recorded on every stored image
+     * @param logId       id of the crawler log entry recorded on every stored image
+     * @return the title of the post
+     */
+    String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId);
 }

+ 149 - 3
src/main/java/top/lvzhiqiang/service/impl/PictureInfoServiceImpl.java

@@ -4,19 +4,28 @@ import com.alibaba.fastjson.JSONObject;
 import com.github.pagehelper.PageInfo;
 import lombok.extern.slf4j.Slf4j;
 import net.coobird.thumbnailator.Thumbnails;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Value;
+import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Propagation;
+import org.springframework.transaction.annotation.Transactional;
+import org.springframework.util.StopWatch;
 import org.springframework.web.multipart.MultipartFile;
 import top.lvzhiqiang.config.InitRunner;
 import top.lvzhiqiang.dto.R;
+import top.lvzhiqiang.entity.FileCrawlerImage;
+import top.lvzhiqiang.entity.FileCrawlerImageLog;
 import top.lvzhiqiang.entity.FileImage;
 import top.lvzhiqiang.enumeration.ResultCodeEnum;
 import top.lvzhiqiang.exception.BusinessException;
 import top.lvzhiqiang.mapper.PictureInfoMapper;
 import top.lvzhiqiang.service.PictureInfoService;
-import top.lvzhiqiang.util.DateUtils;
-import top.lvzhiqiang.util.FtpUtil;
-import top.lvzhiqiang.util.StringUtils;
+import top.lvzhiqiang.util.*;
 
 import javax.annotation.Resource;
 import javax.imageio.ImageIO;
@@ -27,8 +36,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.math.BigDecimal;
 import java.math.RoundingMode;
+import java.net.Proxy;
+import java.net.URLDecoder;
 import java.time.LocalDate;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -46,6 +58,7 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
     private PictureInfoMapper pictureInfoMapper;
     @Value("${spring.profiles.active}")
     private String env;
+    // Date string (format DateUtils.dateFormatter5) taken from LocalDate.now() when this object is
+    // constructed. NOTE(review): it is never refreshed afterwards, so it keeps the construction date.
+    private final String parentPath = LocalDate.now().format(DateUtils.dateFormatter5);
 
     @Override
     public PageInfo<FileImage> getPictureInfoPage(Map<String, Object> params) {
@@ -207,4 +220,137 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
         }
     }
 
+    /**
+     * Crawls the fuliba listing pages and imports every post that was published after the
+     * newest crawled image already stored for category 1.
+     *
+     * <p>Listing pages are walked through their "next page" link. The walk stops at the first
+     * post whose publish date is not after the stored cut-off date, or when no next page exists.
+     * Each post is handed to {@code jsoupFulibaPicSub}; its outcome (status 1 = success,
+     * 2 = failure with the exception message) is written to the crawler log table either way,
+     * so one failing post does not abort the crawl.
+     *
+     * <p>The method is annotated {@code @Async}.
+     *
+     * @param o  unused
+     * @param o1 unused
+     * @throws Exception if a listing page cannot be fetched or parsed, or the thread is
+     *                   interrupted while pausing between posts
+     */
+    @Override
+    @Async
+    public void jsoupFulibaPic(Object o, Object o1) throws Exception {
+        log.warn("jsoupFulibaPic 开始:");
+        StopWatch stopWatch = new StopWatch();
+        stopWatch.start();
+
+        // Cut-off date: publish date of the newest stored image, or the epoch when nothing
+        // usable is stored yet (no row, or a row without a publish date).
+        LocalDate latestDate = LocalDate.of(1970, 1, 1);
+        FileCrawlerImage latestFileCrawlerImage = pictureInfoMapper.findLatestCrawlerImage(1);
+        if (latestFileCrawlerImage != null && latestFileCrawlerImage.getPublishTime() != null) {
+            latestDate = latestFileCrawlerImage.getPublishTime();
+        }
+
+        String crawlerFulibaUrl = InitRunner.dicCodeMap.get("crawler_fuliba_url").getCodeValue();
+        Map<String, String> headerMap = new HashMap<>();
+        headerMap.put("referer", crawlerFulibaUrl);
+
+        outer:
+        while (true) {
+            Document fulibaDocument = JsoupUtil.requestDocument(crawlerFulibaUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
+            log.warn("jsoupFulibaPic page success:url={}", crawlerFulibaUrl);
+
+            Elements sourceSelects = fulibaDocument.select(".content").select("article.excerpt");
+            for (Element sourceSelect : sourceSelects) {
+                String publishTimeStr = sourceSelect.select("div.meta").select("time").text();
+                LocalDate publishTime = LocalDate.parse(publishTimeStr, DateUtils.dateFormatter);
+                if (!publishTime.isAfter(latestDate)) {
+                    // Reached a post that is not newer than the cut-off: nothing more to import.
+                    break outer;
+                }
+
+                // Pause before every detail-page request; done after the cut-off check so the
+                // terminating post does not cost an extra wait.
+                Thread.sleep(2000L);
+                String mainUrl = sourceSelect.select("header").select("a").attr("abs:href");
+                mainUrl = URLDecoder.decode(mainUrl, "UTF-8");
+
+                FileCrawlerImageLog crawlerImageLog = new FileCrawlerImageLog();
+                crawlerImageLog.setId(UUIDUtils.getUUID());
+                crawlerImageLog.setMainUrl(mainUrl);
+                crawlerImageLog.setStatus(1);
+                try {
+                    // Looked up through SpringUtils instead of calling this.jsoupFulibaPicSub directly;
+                    // presumably so the @Transactional proxy of the bean is applied - TODO confirm.
+                    String mainTitle = SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicSub(mainUrl, headerMap, publishTime, crawlerImageLog.getId());
+                    crawlerImageLog.setMainTitle(mainTitle);
+                } catch (Exception e) {
+                    crawlerImageLog.setFailureCause(e.getMessage());
+                    crawlerImageLog.setStatus(2);
+                } finally {
+                    pictureInfoMapper.insertOrUpdateFileCrawlerImageLog(crawlerImageLog);
+                }
+            }
+
+            // Continue with the next listing page, if there is one.
+            Elements nextSelects = fulibaDocument.select("div.pagination > ul").select("li.next-page");
+            if (nextSelects.isEmpty()) {
+                break;
+            }
+            crawlerFulibaUrl = nextSelects.get(0).select("a").attr("abs:href");
+        }
+
+        // The total time is only accumulated when the watch is stopped.
+        stopWatch.stop();
+        log.warn("jsoupFulibaPic 结束:time={}", stopWatch.getTotalTimeSeconds());
+    }
+
+    @Override
+    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
+    public String jsoupFulibaPicSub(String mainUrl, Map<String, String> headerMap, LocalDate publishTime, String logId) {
+        String newName;
+        String imageUrl;
+        String imageSize;
+        String mainTitle;
+        Document fulibaDetailDocument;
+        String ftpImageCrawlerBasePath = InitRunner.dicCodeMap.get("ftp_image_crawler_basepath").getCodeValue();
+        String ftpThumbnailCrawlerBasePath = InitRunner.dicCodeMap.get("ftp_thumbnail_crawler_basepath").getCodeValue();
+        List<String> delPathList = new ArrayList<>();
+        String srcUrl = "";
+        try {
+            fulibaDetailDocument = JsoupUtil.requestDocument(mainUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
+            log.warn("jsoupFulibaPicSub detail success:url={},logId={}", mainUrl, logId);
+            Elements imgEles = fulibaDetailDocument.select("div.content > article.article-content").select("img");
+            mainTitle = fulibaDetailDocument.select("div.content > header.article-header > h1.article-title").select("a").text();
+            for (Element imgEle : imgEles) {
+
+                srcUrl = imgEle.attr("src");
+                String altTitle = imgEle.attr("alt");
+
+                newName = FtpUtil.genImageName();
+                String prefx = srcUrl.substring(srcUrl.lastIndexOf("."));
+                newName = newName + prefx;
+
+                Connection.Response response = Jsoup.connect(srcUrl).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
+
+                FileCrawlerImage fileCrawlerImage = new FileCrawlerImage();
+                fileCrawlerImage.setOldName(altTitle);
+                fileCrawlerImage.setNewName(newName);
+
+                byte[] imageBytes = response.bodyAsBytes();
+                imageSize = BigDecimal.valueOf(imageBytes.length).divide(new BigDecimal("1024")).setScale(0, RoundingMode.UP).toPlainString().concat("KB");
+                fileCrawlerImage.setSize(imageSize);
+
+                imageUrl = parentPath + "/" + newName;
+                fileCrawlerImage.setPath(imageUrl);
+
+                fileCrawlerImage.setRemark("");
+                fileCrawlerImage.setCategoryId(1L);
+                fileCrawlerImage.setOrginUrl(srcUrl);
+                fileCrawlerImage.setLogId(logId);
+                fileCrawlerImage.setPublishTime(publishTime);
+                int count = pictureInfoMapper.insertIgnoreFileImage(fileCrawlerImage);
+                if (count > 0) {
+                    InputStream imageStream1 = new ByteArrayInputStream(imageBytes);
+                    FtpUtil.uploadFile(ftpImageCrawlerBasePath, parentPath, newName, imageStream1);
+                    delPathList.add(ftpImageCrawlerBasePath + imageUrl);
+
+                    InputStream imageStream2 = new ByteArrayInputStream(imageBytes);
+                    ByteArrayOutputStream thumbnailOutputStream = new ByteArrayOutputStream();
+                    Thumbnails.of(imageStream2).size(300, 200).toOutputStream(thumbnailOutputStream);
+                    ByteArrayInputStream thumbnailInputStream = new ByteArrayInputStream(thumbnailOutputStream.toByteArray());
+                    FtpUtil.uploadFile(ftpThumbnailCrawlerBasePath, parentPath, newName, thumbnailInputStream);
+                    delPathList.add(ftpThumbnailCrawlerBasePath + imageUrl);
+                }
+            }
+        } catch (Exception e) {
+            // 异常,删除已经上传的文件
+            if (delPathList.size() > 0) {
+                delPathList.stream().forEach(delPath -> FtpUtil.delFile(delPath));
+            }
+
+            log.error("jsoupFulibaPicSub exception,mainUrl={},publishTime={},srcUrl={}", mainUrl, publishTime, srcUrl, e);
+            throw new BusinessException(30000, e.getMessage());
+        }
+        return mainTitle;
+    }
 }

+ 25 - 0
src/main/java/top/lvzhiqiang/util/UUIDUtils.java

@@ -0,0 +1,25 @@
+package top.lvzhiqiang.util;
+
+import java.util.UUID;
+
+/**
+ * UUID utility class.
+ *
+ * @author shiyong
+ * 2019-12-17 19:36
+ */
+public class UUIDUtils {
+
+    /**
+     * Generates a random UUID and returns it without the "-" separators
+     * (32 hexadecimal characters).
+     *
+     * @return java.lang.String
+     * @author shiyong
+     * 2019/10/23 10:21
+     */
+    public static String getUUID() {
+        // String.replace performs a literal replacement, so no regular expression is
+        // compiled on every call (as replaceAll would do).
+        return UUID.randomUUID().toString().replace("-", "");
+    }
+}

+ 7 - 0
src/main/resources/static/crawler.html

@@ -275,6 +275,13 @@
             <input type="submit" value="提交">
         </form>
     </div>
+
+    <!-- Manual trigger: submitting this form POSTs to pictureInfo/jsoupFulibaPic. -->
+    <div style="margin-right:20px;">
+        <span class="font">jsoupFulibaPic</span>
+        <form method="post" action="pictureInfo/jsoupFulibaPic">
+            <input type="submit" value="提交">
+        </form>
+    </div>
 </div>
 </body>
 </html>

+ 39 - 0
src/test/java/top/lvzhiqiang/TestPicture.java

@@ -0,0 +1,39 @@
+package top.lvzhiqiang;
+
+import lombok.extern.slf4j.Slf4j;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
+import top.lvzhiqiang.service.PictureInfoService;
+
+import javax.annotation.Resource;
+
+/**
+ * Spring Boot test class for the picture service.
+ *
+ * @author lvzhiqiang
+ * @since 11:19 2022/5/2
+ */
+@Slf4j
+@RunWith(SpringJUnit4ClassRunner.class)
+@SpringBootTest(properties = {"spring.profiles.active=dev", "logging.level.top.lvzhiqiang=DEBUG"})
+public class TestPicture {
+
+    @Resource
+    private PictureInfoService pictureInfoService;
+
+    /**
+     * Starts the fuliba crawl through the injected service, passing null for both arguments.
+     * NOTE(review): the service implementation is annotated @Async; if async execution is
+     * enabled in the test context this call may return before the crawl has finished - confirm.
+     *
+     * @throws Exception whatever the service call propagates
+     */
+    @Test
+    public void testJsoupFulibaPicJob() throws Exception {
+        log.warn("jsoupFulibaPicJob开始==============================");
+        pictureInfoService.jsoupFulibaPic(null, null);
+    }
+}