Kaynağa Gözat

update:javdb演员爬取采用新方式v2

lvzhiqiang 1 ay önce
ebeveyn
işleme
1fadd15953

+ 16 - 1
pom.xml

@@ -106,7 +106,16 @@
         <dependency>
             <groupId>com.microsoft.playwright</groupId>
             <artifactId>playwright</artifactId>
-            <version>1.49.0</version>
+            <!--centos7系统需要降级到 v1.37.0 或更低(比如 v1.30.0),centos8可以用1.49.0以上。-->
+            <!--CentOS 7 默认的 glibc 版本是 2.17,而 Playwright v1.49 自带的 Node.js 需要 glibc 2.27+。-->
+            <!--运行你的 jar 包,它第一次启动会自动下载 chromium (~130MB) 到 ~/.cache/ms-playwright-->
+            <version>1.30.0</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>com.microsoft.playwright</groupId>
+                    <artifactId>driver-bundle</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
 
         <dependency>
@@ -224,6 +233,12 @@
             <plugin>
                 <groupId>org.springframework.boot</groupId>
                 <artifactId>spring-boot-maven-plugin</artifactId>
+                <configuration>
+                    <!--JAR (默认): 使用 JarLauncher。它只能加载 JAR 包内部 BOOT-INF/lib 下的依赖,不支持外部依赖加载。-->
+                    <!--ZIP: 使用 PropertiesLauncher。这是 Spring Boot 专门设计用来支持 “从外部路径加载依赖” 的启动器。-->
+                    <!--当你配置了 <layout>ZIP</layout> 后,Maven 插件在打包时会自动修改 JAR 包里的 META-INF/MANIFEST.MF 文件,把 Main-Class 从默认的 JarLauncher 改为 PropertiesLauncher-->
+                    <layout>ZIP</layout>
+                </configuration>
             </plugin>
         </plugins>
     </build>

+ 39 - 31
src/main/java/top/lvzhiqiang/service/impl/AbstractPlaywrightService.java

@@ -1,42 +1,36 @@
 package top.lvzhiqiang.service.impl;
 
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.net.HttpURLConnection;
-import java.net.InetSocketAddress;
-import java.net.Proxy;
-import java.net.URL;
-import java.nio.file.Paths;
-import java.util.Arrays;
-import java.util.Base64;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import javax.annotation.PostConstruct;
-import javax.annotation.PreDestroy;
-import javax.annotation.Resource;
-
+import com.alibaba.fastjson.JSONObject;
+import com.microsoft.playwright.*;
+import lombok.extern.slf4j.Slf4j;
 import org.jsoup.Connection;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.system.ApplicationHome;
 import org.springframework.stereotype.Component;
-
-import com.alibaba.fastjson.JSONObject;
-import com.microsoft.playwright.Browser;
-import com.microsoft.playwright.BrowserContext;
-import com.microsoft.playwright.BrowserType;
-import com.microsoft.playwright.Page;
-import com.microsoft.playwright.Playwright;
-
-import lombok.extern.slf4j.Slf4j;
 import top.lvzhiqiang.entity.DicCode;
 import top.lvzhiqiang.exception.BusinessException;
 import top.lvzhiqiang.mapper.DicCodeMapper;
 import top.lvzhiqiang.service.ScraperService;
 import top.lvzhiqiang.util.JsoupUtil;
 
+import javax.annotation.PostConstruct;
+import javax.annotation.PreDestroy;
+import javax.annotation.Resource;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+
 /**
  * 抽象基类:负责 Playwright 的生命周期管理、防反爬配置、Session 自动维护
  *
@@ -117,7 +111,11 @@ public abstract class AbstractPlaywrightService implements ScraperService {
         log.info("正在启动浏览器 (模式: {} )...", isDev ? "有头调试" : "无头生产");
         browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
                 .setHeadless(!isDev) // 生产环境 true (无头),开发环境 false (有头)
-                .setChannel("chrome") // 尽量使用本机 Chrome,抗指纹能力更强
+                // .setChannel("chrome") 的含义:让 Playwright 不使用自带的 Chromium,而是启动系统中已安装的 Google Chrome(通常位于 /opt/google/chrome/chrome)。
+                // 现状:CentOS 服务器是裸机,没有安装 Google Chrome 的 RPM 包(在 CentOS 7 上安装 Chrome 依赖问题非常多)。
+                // 因此改用 Playwright 自行下载的 Chromium(绿色免安装,位于 ~/.cache/ms-playwright 目录下),不再调用系统的 Chrome。
+                // 做法:注释掉下面的 .setChannel("chrome") 这一行。
+                //.setChannel("chrome") // 尽量使用本机 Chrome,抗指纹能力更强
                 .setArgs(launchArgs) // 使用 Arrays.asList 生成的参数
         );
 
@@ -155,11 +153,11 @@ public abstract class AbstractPlaywrightService implements ScraperService {
      * 核心登录方法(带重试机制),返回是否成功
      */
     public boolean login() {
-        int maxRetries = 3;
+        int maxRetries = 5;
+        // 先去登录页
+        page.navigate(getLoginUrl());
         for (int i = 0; i < maxRetries; i++) {
             try {
-                // 先去登录页
-                page.navigate(getLoginUrl());
                 log.info("尝试登录{}... 第 {} 次", getLoginUrl(), (i + 1));
                 // 执行子类的动作
                 if (doLoginAction()) {
@@ -170,7 +168,17 @@ public abstract class AbstractPlaywrightService implements ScraperService {
 
                 // 截图保留现场,方便排查
                 try {
-                    page.screenshot(new Page.ScreenshotOptions().setPath(Paths.get("login_error_" + i + ".png")));
+                    // 1. 获取 JAR 包所在的绝对路径 (Spring Boot 专属神器)
+                    String jarPath = new ApplicationHome(getClass()).getSource().getParent();
+                    // 2. 构造子文件夹路径 (例如: /usr/program/jav/files/playwright)
+                    Path dirPath = Paths.get(jarPath, "files/playwright"); // 子文件夹为 files/playwright
+                    // 3. 关键一步:如果文件夹不存在,必须先创建!
+                    if (!Files.exists(dirPath)) Files.createDirectories(dirPath);
+                    // 4. 拼接完整的文件路径
+                    String timeStr = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
+                    Path fullPath = dirPath.resolve("login_error_" + timeStr + "_" + i + ".png");
+
+                    page.screenshot(new Page.ScreenshotOptions().setPath(fullPath));
                 } catch (Exception ignored) {
                 }
             }

+ 2 - 2
src/main/java/top/lvzhiqiang/service/impl/Crawler4JavdbServiceImpl.java

@@ -128,7 +128,7 @@ public class Crawler4JavdbServiceImpl implements Crawler4JavdbService {
                     insertCodeList.add(code);
                     XxlJobHelper.log("videoMonitorActors insertCodeList code={} add", code);
 
-                    String url = itembSelect.select("a.box").get(0).attr("abs:href");
+                    String url = itembSelect.select("a.box").get(0).attr("href");
                     String title = itembSelect.select("a.box").get(0).attr("title");
                     String score = itembSelect.select("a.box").get(0).select("div.score").text().replace("&nbsp;", "").trim();
                     String date = itembSelect.select("a.box").get(0).select("div.meta").text().trim();
@@ -143,7 +143,7 @@ public class Crawler4JavdbServiceImpl implements Crawler4JavdbService {
 
                     JSONObject params = new JSONObject();
                     params.put("title", "JAVDB演员监控报警");
-                    params.put("logUrl", url);
+                    params.put("logUrl", "https://javdb.com" + url);
                     params.put("btnTxt", "影片详情");
                     monitorAlarm4APP_TEXT_CARD(content, params);
 

+ 20 - 15
src/main/java/top/lvzhiqiang/service/impl/JavdbPlaywrightServiceImpl.java

@@ -1,17 +1,16 @@
 package top.lvzhiqiang.service.impl;
 
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Paths;
-
+import com.microsoft.playwright.Locator;
+import com.microsoft.playwright.Response;
+import lombok.extern.slf4j.Slf4j;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.springframework.boot.system.ApplicationHome;
 import org.springframework.stereotype.Service;
 
-import com.microsoft.playwright.Locator;
-import com.microsoft.playwright.Response;
-
-import lombok.extern.slf4j.Slf4j;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 
 /**
  * 实现 JavDB 业务类
@@ -104,16 +103,22 @@ public class JavdbPlaywrightServiceImpl extends AbstractPlaywrightService {
             byte[] imgBytes = captchaResponse.body();
             log.info("成功捕获验证码图片,大小: {} bytes", imgBytes.length);
 
+            // 调用百度 OCR 接口
+            String captchaCode = getCode(imgBytes);
+            log.info("验证码识别完成,结果: {}", captchaCode);
             try {
-                Files.write(Paths.get("output.png"), imgBytes);
-                System.out.println("文件写入成功!");
-            } catch (IOException e) {
+                // 1. 获取 JAR 包所在的绝对路径 (Spring Boot 专属神器)
+                String jarPath = new ApplicationHome(getClass()).getSource().getParent();
+                // 2. 构造子文件夹路径 (例如: /usr/program/jav/files/playwright)
+                Path dirPath = Paths.get(jarPath, "files/playwright"); // 子文件夹为 files/playwright
+                // 3. 关键一步:如果文件夹不存在,必须先创建!
+                if (!Files.exists(dirPath)) Files.createDirectories(dirPath);
+                // 4. 写入
+                Files.write(dirPath.resolve(captchaCode + ".png"), imgBytes);
+            } catch (Exception e) {
                 e.printStackTrace();
             }
 
-            // 调用百度 OCR 接口
-            String captchaCode = getCode(imgBytes);
-            log.info("验证码识别完成,结果: {}", captchaCode);
             page.fill("input[name='_rucaptcha']", captchaCode);
         }
 
@@ -131,7 +136,7 @@ public class JavdbPlaywrightServiceImpl extends AbstractPlaywrightService {
         // 等待 URL 变更为非 login 或 出现"我的账户"
         // 使用 try-catch 等待,如果超时说明登录可能失败(比如验证码错)
         try {
-            page.waitForURL(url -> !url.contains("/login")&& !url.contains("sessions"),
+            page.waitForURL(url -> !url.contains("/login")&& !url.contains("/user_sessions"),
                     new com.microsoft.playwright.Page.WaitForURLOptions().setTimeout(15000));
 
             log.info("登录成功!跳转到了: {}", page.url());