|
@@ -0,0 +1,307 @@
|
|
|
|
|
+package top.lvzhiqiang.service.impl;
|
|
|
|
|
+
|
|
|
|
|
+import java.io.BufferedReader;
|
|
|
|
|
+import java.io.InputStreamReader;
|
|
|
|
|
+import java.net.HttpURLConnection;
|
|
|
|
|
+import java.net.InetSocketAddress;
|
|
|
|
|
+import java.net.Proxy;
|
|
|
|
|
+import java.net.URL;
|
|
|
|
|
+import java.nio.file.Paths;
|
|
|
|
|
+import java.util.Arrays;
|
|
|
|
|
+import java.util.Base64;
|
|
|
|
|
+import java.util.HashMap;
|
|
|
|
|
+import java.util.List;
|
|
|
|
|
+import java.util.Map;
|
|
|
|
|
+
|
|
|
|
|
+import javax.annotation.PostConstruct;
|
|
|
|
|
+import javax.annotation.PreDestroy;
|
|
|
|
|
+import javax.annotation.Resource;
|
|
|
|
|
+
|
|
|
|
|
+import org.jsoup.Connection;
|
|
|
|
|
+import org.jsoup.Jsoup;
|
|
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
|
|
+import org.springframework.beans.factory.annotation.Value;
|
|
|
|
|
+import org.springframework.stereotype.Component;
|
|
|
|
|
+
|
|
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
|
|
|
+import com.microsoft.playwright.Browser;
|
|
|
|
|
+import com.microsoft.playwright.BrowserContext;
|
|
|
|
|
+import com.microsoft.playwright.BrowserType;
|
|
|
|
|
+import com.microsoft.playwright.Page;
|
|
|
|
|
+import com.microsoft.playwright.Playwright;
|
|
|
|
|
+
|
|
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
|
|
+import top.lvzhiqiang.entity.DicCode;
|
|
|
|
|
+import top.lvzhiqiang.exception.BusinessException;
|
|
|
|
|
+import top.lvzhiqiang.mapper.DicCodeMapper;
|
|
|
|
|
+import top.lvzhiqiang.service.ScraperService;
|
|
|
|
|
+import top.lvzhiqiang.util.JsoupUtil;
|
|
|
|
|
+
|
|
|
|
|
+/**
|
|
|
|
|
+ * 抽象基类:负责 Playwright 的生命周期管理、防反爬配置、Session 自动维护
|
|
|
|
|
+ *
|
|
|
|
|
+ * @author: lvzhiqiang
|
|
|
|
|
+ * @date: 2026/2/11 13:54
|
|
|
|
|
+ */
|
|
|
|
|
+@Slf4j
|
|
|
|
|
+@Component
|
|
|
|
|
+public abstract class AbstractPlaywrightService implements ScraperService {
|
|
|
|
|
+ // Playwright 对象池 (复用)
|
|
|
|
|
+ protected Playwright playwright;
|
|
|
|
|
+ protected Browser browser;
|
|
|
|
|
+ protected BrowserContext context;
|
|
|
|
|
+ protected Page page;
|
|
|
|
|
+ protected Proxy proxy;
|
|
|
|
|
+ protected Map<String, DicCode> codeConstantMap;
|
|
|
|
|
+
|
|
|
|
|
+ @Value("${spring.profiles.active}")
|
|
|
|
|
+ private String env;
|
|
|
|
|
+
|
|
|
|
|
+ @Resource
|
|
|
|
|
+ private DicCodeMapper dicCodeMapper;
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 由子类提供:登录页 URL
|
|
|
|
|
+ *
|
|
|
|
|
+ * @return
|
|
|
|
|
+ */
|
|
|
|
|
+ protected abstract String getLoginUrl();
|
|
|
|
|
+
|
|
|
|
|
+ protected abstract String getPageDocumentSelector();
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 由子类提供:具体的登录动作 (填表、验证码)
|
|
|
|
|
+ *
|
|
|
|
|
+ * @return
|
|
|
|
|
+ * @throws Exception
|
|
|
|
|
+ */
|
|
|
|
|
+ protected abstract boolean doLoginAction() throws Exception;
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 由子类提供:判断当前页面是否已登录 (例如检查右上角有没有头像)
|
|
|
|
|
+ *
|
|
|
|
|
+ * @return
|
|
|
|
|
+ */
|
|
|
|
|
+ protected abstract boolean isLoginSuccess();
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * Spring 启动时初始化 Playwright 浏览器实例 (或者使用懒加载)
|
|
|
|
|
+ * // 只要服务没挂,bean 还在,browser 就在
|
|
|
|
|
+ * // 如果服务重启了,bean 重新初始化,browser 也是新的
|
|
|
|
|
+ * // 第一次调用 getPageDocumentSecurely 时,
|
|
|
|
|
+ * // 检测到 URL 跳转到了 /login (因为新浏览器没 cookie),会触发 doLoginAction
|
|
|
|
|
+ * // 登录成功后,继续抓取。
|
|
|
|
|
+ *
|
|
|
|
|
+ */
|
|
|
|
|
+ @PostConstruct
|
|
|
|
|
+ public void init() {
|
|
|
|
|
+ // 1. 初始化 Playwright
|
|
|
|
|
+ log.info("正在初始化 Playwright 浏览器内核...");
|
|
|
|
|
+ playwright = Playwright.create();
|
|
|
|
|
+
|
|
|
|
|
+ // 智能判断环境:如果有图形界面或者是开发环境,开启有头模式调试;否则无头模式
|
|
|
|
|
+ // 这里简单粗暴一点,生产环境通常没有 DISPLAY 变量,或者你可以通过配置文件传入
|
|
|
|
|
+ boolean isDev = "dev".equals(env);
|
|
|
|
|
+
|
|
|
|
|
+ // 准备启动参数(服务器防崩溃三件套)
|
|
|
|
|
+ List<String> launchArgs = Arrays.asList(
|
|
|
|
|
+ "--no-sandbox", // 允许 root 用户运行(CentOS 默认是用 root 的多)
|
|
|
|
|
+ "--disable-setuid-sandbox", // 禁用 setuid 沙箱,防止某些内核安全机制拦截
|
|
|
|
|
+ "--disable-dev-shm-usage", // 【关键】防止内存不足导致崩溃。Linux 默认的 /dev/shm(共享内存)非常小(64MB),Chrome 这种吃内存大户一旦页面复杂点(比如
|
|
|
|
|
+ // JavDB 的图片多),就会因为共享内存不足而崩溃(Crashpad 错误)。加上这个参数,它就会使用 /tmp 目录,空间管够。
|
|
|
|
|
+ "--disable-gpu", // 服务器没显卡,关掉省资源
|
|
|
|
|
+ "--disable-blink-features=AutomationControlled" // 去除自动化特征
|
|
|
|
|
+ );
|
|
|
|
|
+
|
|
|
|
|
+ // 2. 启动浏览器
|
|
|
|
|
+ log.info("正在启动浏览器 (模式: {} )...", isDev ? "有头调试" : "无头生产");
|
|
|
|
|
+ browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
|
|
|
|
|
+ .setHeadless(!isDev) // 生产环境 true (无头),开发环境 false (有头)
|
|
|
|
|
+ .setChannel("chrome") // 尽量使用本机 Chrome,抗指纹能力更强
|
|
|
|
|
+ .setArgs(launchArgs) // 使用 Arrays.asList 生成的参数
|
|
|
|
|
+ );
|
|
|
|
|
+
|
|
|
|
|
+ // 3. 创建上下文 (设置 UserAgent 和 视窗大小,伪装成普通 Mac 用户)
|
|
|
|
|
+ context = browser.newContext(new Browser.NewContextOptions()
|
|
|
|
|
+ .setUserAgent(JsoupUtil.getUserAgent())
|
|
|
|
|
+ .setViewportSize(1920, 1080)); // 防止响应式布局陷阱+反爬指纹
|
|
|
|
|
+
|
|
|
|
|
+ // 增加全局默认超时时间为 60秒,防止 CF 盾加载慢
|
|
|
|
|
+ context.setDefaultTimeout(60000);
|
|
|
|
|
+ page = context.newPage();
|
|
|
|
|
+
|
|
|
|
|
+ // 4. 设置代理模式
|
|
|
|
|
+ if ("dev".equals(env)) {
|
|
|
|
|
+ proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897));
|
|
|
|
|
+ } else {
|
|
|
|
|
+ proxy = Proxy.NO_PROXY;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 5. 其他字典参数
|
|
|
|
|
+ codeConstantMap = dicCodeMapper.findAllMapByEnv(env);
|
|
|
|
|
+
|
|
|
|
|
+ log.info("浏览器初始化完成。");
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ @PreDestroy
|
|
|
|
|
+ public void cleanup() {
|
|
|
|
|
+ if (browser != null)
|
|
|
|
|
+ browser.close();
|
|
|
|
|
+ if (playwright != null)
|
|
|
|
|
+ playwright.close();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 核心登录方法(带重试机制),返回是否成功
|
|
|
|
|
+ */
|
|
|
|
|
+ public boolean login() {
|
|
|
|
|
+ int maxRetries = 3;
|
|
|
|
|
+ for (int i = 0; i < maxRetries; i++) {
|
|
|
|
|
+ try {
|
|
|
|
|
+ // 先去登录页
|
|
|
|
|
+ page.navigate(getLoginUrl());
|
|
|
|
|
+ log.info("尝试登录{}... 第 {} 次", getLoginUrl(), (i + 1));
|
|
|
|
|
+ // 执行子类的动作
|
|
|
|
|
+ if (doLoginAction()) {
|
|
|
|
|
+ return true;
|
|
|
|
|
+ }
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ log.error("登录超时,可能原因:验证码错误或账号问题。当前页面 URL: {}", page.url(), e);
|
|
|
|
|
+
|
|
|
|
|
+ // 截图保留现场,方便排查
|
|
|
|
|
+ try {
|
|
|
|
|
+ page.screenshot(new Page.ScreenshotOptions().setPath(Paths.get("login_error_" + i + ".png")));
|
|
|
|
|
+ } catch (Exception ignored) {
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ // 失败后刷新页面重试
|
|
|
|
|
+ page.reload();
|
|
|
|
|
+ }
|
|
|
|
|
+ return false;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 【核心方法】获取页面源码 (带 Session 检查机制)
|
|
|
|
|
+ * 所有的业务抓取都要调用这个方法,不要直接用 page.navigate
|
|
|
|
|
+ */
|
|
|
|
|
+ @Override
|
|
|
|
|
+ public Document getPageDocumentSecurely(String targetUrl) {
|
|
|
|
|
+ log.info("准备访问: {}", targetUrl);
|
|
|
|
|
+
|
|
|
|
|
+ // 1. 发起访问
|
|
|
|
|
+ page.navigate(targetUrl);
|
|
|
|
|
+
|
|
|
|
|
+ // 2. 检查:我是不是被重定向到登录页了? 或者 Cookie 过期了?
|
|
|
|
|
+ // 这里只是一个简单的判断示例,根据 URL 或页面特征判断
|
|
|
|
|
+ if (page.url().contains("/login") || !isLoginSuccess()) {
|
|
|
|
|
+ log.warn("检测到 Session 过期或未登录,触发自动登录流程...");
|
|
|
|
|
+
|
|
|
|
|
+ // 执行登录
|
|
|
|
|
+ if (login()) {
|
|
|
|
|
+ log.info("登录成功,重新访问目标页面...{}", targetUrl);
|
|
|
|
|
+ // 登录后,带上新的 Cookie 重新访问刚才失败的链接
|
|
|
|
|
+ page.navigate(targetUrl);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ throw new BusinessException("自动登录失败,无法抓取: " + targetUrl);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 3. 等待内容加载 (子类可以通过 override 调整选择器)
|
|
|
|
|
+ try {
|
|
|
|
|
+ page.waitForSelector(getPageDocumentSelector(), new Page.WaitForSelectorOptions().setTimeout(30000));
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ log.error("页面加载超时或核心元素未找到: {}", e.getMessage());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 4. 返回 Jsoup 文档
|
|
|
|
|
+ // 把字符串喂给 Jsoup,而不是让 Jsoup 去联网
|
|
|
|
|
+ return Jsoup.parse(page.content());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ protected String getCode(String captchaImgUrl) throws Exception {
|
|
|
|
|
+ String ocrAccurateBasicUrl = codeConstantMap.get("bd_ocr_url").getCodeValue();
|
|
|
|
|
+ String accessToken = getAuth( codeConstantMap.get("bd_ak").getCodeValue(), codeConstantMap.get("bd_sk").getCodeValue());
|
|
|
|
|
+ Map<String, String> headerParams = new HashMap<>();
|
|
|
|
|
+ Map<String, String> params = new HashMap<>();
|
|
|
|
|
+ headerParams.put("Content-Type", "application/x-www-form-urlencoded");
|
|
|
|
|
+
|
|
|
|
|
+ Connection.Response imgResponse = JsoupUtil.requestBody(captchaImgUrl, JsoupUtil.HTTP_GET, proxy, null,
|
|
|
|
|
+ headerParams, null);
|
|
|
|
|
+ byte[] imgBytes = imgResponse.bodyAsBytes();
|
|
|
|
|
+ params.put("image", Base64.getEncoder().encodeToString(imgBytes));
|
|
|
|
|
+ Connection.Response ocrResponse = JsoupUtil.requestBody(
|
|
|
|
|
+ ocrAccurateBasicUrl.concat("?access_token=").concat(accessToken),
|
|
|
|
|
+ JsoupUtil.HTTP_POST, Proxy.NO_PROXY, headerParams, params);
|
|
|
|
|
+ JSONObject crAccurateBasicResult = JSONObject.parseObject(ocrResponse.body());
|
|
|
|
|
+ String seccodeverify = crAccurateBasicResult.getJSONArray("words_result").getJSONObject(0).getString("words");
|
|
|
|
|
+ return seccodeverify;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ protected String getCode(byte[] captchaImgBytes) throws Exception {
|
|
|
|
|
+ String ocrAccurateBasicUrl = codeConstantMap.get("bd_ocr_url").getCodeValue();
|
|
|
|
|
+ String accessToken = getAuth(codeConstantMap.get("bd_ak").getCodeValue(), codeConstantMap.get("bd_sk").getCodeValue());
|
|
|
|
|
+ Map<String, String> headerParams = new HashMap<>();
|
|
|
|
|
+ Map<String, String> params = new HashMap<>();
|
|
|
|
|
+ headerParams.put("Content-Type", "application/x-www-form-urlencoded");
|
|
|
|
|
+
|
|
|
|
|
+ params.put("image", Base64.getEncoder().encodeToString(captchaImgBytes));
|
|
|
|
|
+ Connection.Response ocrResponse = JsoupUtil.requestBody(
|
|
|
|
|
+ ocrAccurateBasicUrl.concat("?access_token=").concat(accessToken),
|
|
|
|
|
+ JsoupUtil.HTTP_POST, Proxy.NO_PROXY, headerParams, params);
|
|
|
|
|
+ JSONObject crAccurateBasicResult = JSONObject.parseObject(ocrResponse.body());
|
|
|
|
|
+ String seccodeverify = crAccurateBasicResult.getJSONArray("words_result").getJSONObject(0).getString("words");
|
|
|
|
|
+ return seccodeverify;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 获取API访问token
|
|
|
|
|
+ * 该token有一定的有效期,需要自行管理,当失效时需重新获取.
|
|
|
|
|
+ *
|
|
|
|
|
+ * @param ak - 百度云官网获取的 API Key
|
|
|
|
|
+ * @param sk - 百度云官网获取的 Securet Key
|
|
|
|
|
+ * @return assess_token 示例:
|
|
|
|
|
+ * "24.460da4889caad24cccdb1fea17221975.2592000.1491995545.282335-1234567"
|
|
|
|
|
+ */
|
|
|
|
|
+ private String getAuth(String ak, String sk) {
|
|
|
|
|
+ // 获取token地址
|
|
|
|
|
+ String authHost = codeConstantMap.get("bd_authhost_url").getCodeValue();
|
|
|
|
|
+ String getAccessTokenUrl = authHost
|
|
|
|
|
+ // 1. grant_type为固定参数
|
|
|
|
|
+ + "grant_type=client_credentials"
|
|
|
|
|
+ // 2. 官网获取的 API Key
|
|
|
|
|
+ + "&client_id=" + ak
|
|
|
|
|
+ // 3. 官网获取的 Secret Key
|
|
|
|
|
+ + "&client_secret=" + sk;
|
|
|
|
|
+ try {
|
|
|
|
|
+ URL realUrl = new URL(getAccessTokenUrl);
|
|
|
|
|
+ // 打开和URL之间的连接
|
|
|
|
|
+ HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection();
|
|
|
|
|
+ connection.setRequestMethod("GET");
|
|
|
|
|
+ connection.connect();
|
|
|
|
|
+ // 获取所有响应头字段
|
|
|
|
|
+ Map<String, List<String>> map = connection.getHeaderFields();
|
|
|
|
|
+ // 遍历所有的响应头字段
|
|
|
|
|
+ for (String key : map.keySet()) {
|
|
|
|
|
+ System.err.println(key + "--->" + map.get(key));
|
|
|
|
|
+ }
|
|
|
|
|
+ // 定义 BufferedReader输入流来读取URL的响应
|
|
|
|
|
+ BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
|
|
|
|
|
+ String result = "";
|
|
|
|
|
+ String line;
|
|
|
|
|
+ while ((line = in.readLine()) != null) {
|
|
|
|
|
+ result += line;
|
|
|
|
|
+ }
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 返回结果示例
|
|
|
|
|
+ */
|
|
|
|
|
+ System.err.println("result:" + result);
|
|
|
|
|
+ JSONObject jsonObject = JSONObject.parseObject(result);
|
|
|
|
|
+ String access_token = jsonObject.getString("access_token");
|
|
|
|
|
+ return access_token;
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ System.err.print("获取token失败!");
|
|
|
|
|
+ e.printStackTrace(System.err);
|
|
|
|
|
+ }
|
|
|
|
|
+ return null;
|
|
|
|
|
+ }
|
|
|
|
|
+}
|