Browse Source

add:javbusLog处理v1

lvzhiqiang 3 years ago
parent
commit
decad5ad47

+ 14 - 0
src/main/java/top/lvzhiqiang/controller/CrawlerController.java

@@ -193,4 +193,18 @@ public class CrawlerController {
         crawler4JavbusService.jsoupJavbusProfile(start, limit);
         return "success";
     }
+
+    /**
+     * Triggers reprocessing of the Javbus crawler log records that have the given status.
+     * Mapped to "/handleJavbusLog"; hands the status to
+     * crawler4JavbusService.handleJavbusLog(status) and then returns the literal string "success".
+     * The returned value does not carry any result of the processing itself.
+     *
+     * @param status status of the log records to process; presumably bound from the "status"
+     *               request parameter (the crawler.html form submits a field with that name)
+     * @return always "success"
+     * @throws Exception declared because the service method declares it
+     * @author lvzhiqiang
+     * 2022/10/20 9:45
+     */
+    @RequestMapping("/handleJavbusLog")
+    @ResponseBody
+    public String handleJavbusLog(Integer status) throws Exception {
+
+        crawler4JavbusService.handleJavbusLog(status);
+        return "success";
+    }
 }

+ 1 - 1
src/main/java/top/lvzhiqiang/entity/CrawlerJavbusLog.java

@@ -25,7 +25,7 @@ public class CrawlerJavbusLog implements Serializable {
     private Integer type;
 
     /**
-     * 状态(1:待解决,2:已解决,3:忽略
+     * 状态(1:待解决,2:已解决,3:未解决)
      */
     private Integer status;
 

+ 9 - 3
src/main/java/top/lvzhiqiang/mapper/CrawlerJavbusProfileMapper.java

@@ -77,8 +77,14 @@ public interface CrawlerJavbusProfileMapper {
      *
      * @param crawlerJavbusLog
      */
-    @Insert("INSERT INTO crawler_javbus_log(type, business_key, error_msg,create_time,modify_time) " +
-            "VALUES (#{type}, #{businessKey}, #{errorMsg}, now(), now()) " +
-            "ON DUPLICATE KEY UPDATE error_msg=values(error_msg),modify_time=now()")
+    @Insert("INSERT INTO crawler_javbus_log(type,status,business_key,error_msg,create_time,modify_time) " +
+            "VALUES (#{type}, #{status}, #{businessKey}, #{errorMsg}, now(), now()) " +
+            "ON DUPLICATE KEY UPDATE status=values(status),error_msg=values(error_msg),modify_time=now()")
     void insertOrUpdateLog(CrawlerJavbusLog crawlerJavbusLog);
+
+    /**
+     * Finds log records by status.
+     * Selects every column of the crawler_javbus_log rows whose delete_flag is 1 and whose status
+     * equals the argument.
+     * NOTE(review): delete_flag = 1 presumably marks rows that are not deleted - confirm.
+     *
+     * @param status status value to match
+     * @return the matching log records
+     */
+    @Select("SELECT * FROM crawler_javbus_log WHERE delete_flag = 1 and status = #{status}")
+    List<CrawlerJavbusLog> findJavbusLogByStatus(Integer status);
 }

+ 2 - 0
src/main/java/top/lvzhiqiang/service/Crawler4JavbusService.java

@@ -9,4 +9,6 @@ package top.lvzhiqiang.service;
 public interface Crawler4JavbusService {
 
     void jsoupJavbusProfile(Long start, Integer limit) throws Exception;
+    /**
+     * Handles the Javbus crawler log records that have the given status.
+     *
+     * @param status status value used to select the log records to handle
+     * @throws Exception declared by the contract; the concrete failure cases depend on the implementation
+     */
+    void handleJavbusLog(Integer status) throws Exception;
 }

+ 143 - 75
src/main/java/top/lvzhiqiang/service/impl/Crawler4JavbusServiceImpl.java

@@ -52,25 +52,16 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
 
     Map<String, String> javbusConstantMap = null;
     Map<String, String> javbusCookiesMap = null;
-    private String bdAccessToken = "";
+    String bdAccessToken = "";
+    Proxy proxy = null;
 
-    @Async
-    @Override
-    public void jsoupJavbusProfile(Long start, Integer limit) throws Exception {
-        log.warn("jsoupJavbusProfile 开始:start={},limit={}", start, limit);
-        StopWatch stopWatch = new StopWatch();
-        stopWatch.start();
-
-        // 获取javbus常量MAP
-        javbusConstantMap = dicCodeMapper.findAll().stream()
-                .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
-                .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
-        // 代理及TOKEN设置
-        Proxy proxy;
-        if ("dev".equals(env)) {
-            proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
-        } else {
-            proxy = Proxy.NO_PROXY;
+    public void beforeJavbus() throws Exception {
+        if (null == proxy) {
+            if ("dev".equals(env)) {
+                proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
+            } else {
+                proxy = Proxy.NO_PROXY;
+            }
         }
         if (StringUtils.isEmpty(bdAccessToken)) {
             bdAccessToken = getAuth(javbusConstantMap.get("bd_ak"), javbusConstantMap.get("bd_sk"));
@@ -87,11 +78,82 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
                 throw new Exception("获取javbusCookies失败!");
             }
         }
+    }
+
+    /**
+     * Crawls Javbus forum profiles. Annotated with {@code @Async}.
+     * <p>
+     * Reloads the javbus constants that apply to the current environment, runs
+     * {@link #beforeJavbus()} to prepare the proxy, access token and cookies, and then delegates
+     * to {@code jsoupJavbusProfileSub}. The elapsed time is logged when the run completes.
+     *
+     * @param start passed through to {@code jsoupJavbusProfileSub}
+     * @param limit passed through to {@code jsoupJavbusProfileSub}
+     * @throws Exception if the preparation step or the crawl fails
+     */
+    @Async
+    @Override
+    public void jsoupJavbusProfile(Long start, Integer limit) throws Exception {
+        log.warn("jsoupJavbusProfile 开始:start={},limit={}", start, limit);
+        StopWatch stopWatch = new StopWatch();
+        stopWatch.start();
+
+        // Load the javbus constants that apply to the current environment
+        javbusConstantMap = dicCodeMapper.findAll().stream()
+                .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
+                .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
+        // Proxy, access token and cookie setup
+        beforeJavbus();
         // Fetch the profiles
         jsoupJavbusProfileSub(proxy, start, limit);
+        // The total time only includes completed tasks, so the watch must be stopped before it is read;
+        // without this the logged duration is always 0.
+        stopWatch.stop();
         log.warn("jsoupJavbusProfile 结束:time={}", stopWatch.getTotalTimeSeconds());
     }
 
+    /**
+     * Re-processes the Javbus crawler log records that have the given status.
+     * Annotated with {@code @Async}.
+     * <p>
+     * Reloads the javbus constants that apply to the current environment, runs
+     * {@link #beforeJavbus()} to prepare the proxy, access token and cookies, and then retries the
+     * matching log records via {@code handleJavbusLogSub}. The elapsed time is logged at the end.
+     *
+     * @param status status of the log records to process
+     * @throws Exception if the preparation step fails
+     */
+    @Async
+    @Override
+    public void handleJavbusLog(Integer status) throws Exception {
+        log.warn("handleJavbusLog 开始:status={}", status);
+        StopWatch stopWatch = new StopWatch();
+        stopWatch.start();
+
+        // Load the javbus constants that apply to the current environment
+        javbusConstantMap = dicCodeMapper.findAll().stream()
+                .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
+                .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
+        // Proxy, access token and cookie setup
+        beforeJavbus();
+        // Retry the log records that have the requested status
+        handleJavbusLogSub(status);
+        // The total time only includes completed tasks, so the watch must be stopped before it is read;
+        // without this the logged duration is always 0.
+        stopWatch.stop();
+        log.warn("handleJavbusLog 结束:time={}", stopWatch.getTotalTimeSeconds());
+    }
+
+    private void handleJavbusLogSub(Integer status) {
+        List<CrawlerJavbusLog> javbusLogByStatus = crawlerJavbusProfileMapper.findJavbusLogByStatus(status);
+
+        String profileUrl = "https://www.javbus.com/forum/?";
+        Document profileDocument;
+        for (CrawlerJavbusLog javbusLog : javbusLogByStatus) {
+            String uid = "";
+            String nickName = "";
+            try {
+                profileDocument = JsoupUtil.requestDocument(profileUrl.concat(javbusLog.getBusinessKey()), JsoupUtil.HTTP_GET, proxy, javbusCookiesMap, null, null);
+                if (profileDocument.html().contains("您指定的用戶空間不存在")) {
+                    log.warn("jsoupJavbusProfileSub您指定的用戶空間不存在,start={}", javbusLog.getBusinessKey());
+                    continue;
+                }
+
+                String[] mbn0Arr = profileDocument.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
+                nickName = mbn0Arr[0].trim();
+                uid = mbn0Arr[1].trim();
+
+                CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
+                crawlerJavbusProfile.setUid(Long.valueOf(uid));
+                crawlerJavbusProfile.setNickName(nickName);
+                parseJavbusProfile(profileDocument, crawlerJavbusProfile);
+                crawlerJavbusProfileMapper.insertOrUpdate(crawlerJavbusProfile);
+                log.warn("jsoupJavbusProfileSub成功插入,uid={}", uid);
+
+                javbusLog.setStatus(2);
+                javbusLog.setErrorMsg("");
+            } catch (Exception e) {
+                log.error("jsoupJavbusProfileSub插入异常,nickName={},uid={}", nickName, uid, e);
+                javbusLog.setStatus(3);
+                javbusLog.setErrorMsg(e.getMessage());
+            }
+
+            crawlerJavbusProfileMapper.insertOrUpdateLog(javbusLog);
+        }
+    }
+
     private void jsoupJavbusProfileSub(Proxy proxy, Long start, Integer limit) throws Exception {
         CrawlerJavbusProfile latestJavbusProfile = crawlerJavbusProfileMapper.findLatestInfo();
         if (start == null && latestJavbusProfile == null) {
@@ -130,76 +192,21 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
                     continue;
                 }
 
-                String avatarUrl = profileDocument.select("div.avt").select("img").attr("src");
                 String[] mbn0Arr = profileDocument.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
                 nickName = mbn0Arr[0].trim();
                 uid = mbn0Arr[1].trim();
-                String emailStatus = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").first().text().replace("郵箱狀態", "").trim();
-
-                Elements signEles = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(1).select("li:contains(個人簽名)");
-                String signStr = "";
-                ArrayList<String> signImgList = new ArrayList<>();
-                if (signEles.size() > 0) {
-                    signStr = signEles.first().select("table").text();
-                    Elements signImgEles = signEles.first().select("table").select("img");
-                    for (Element signImgEle : signImgEles) {
-                        signImgList.add(signImgEle.attr("src"));
-                    }
-                }
-
-                String friendNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
-                        .select("a").get(0).text().replace("好友數", "").trim();
-                String replyNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
-                        .select("a").get(1).text().replace("回帖數", "").trim();
-                String threadNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
-                        .select("a").get(2).text().replace("主題數", "").trim();
-                String userGroup = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(0)
-                        .select("a").text();
-                String onlineTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
-                        .select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
-                String registrationTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
-                        .select("li:contains(註冊時間)").text().replace("註冊時間", "").trim();
-                String lastVisit = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
-                        .select("li:contains(最後訪問)").text().replace("最後訪問", "").trim();
-                String lastActivityTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
-                        .select("li:contains(上次活動時間)").text().replace("上次活動時間", "").trim();
-                String lastPublishedTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
-                        .select("li:contains(上次發表時間)").text().replace("上次發表時間", "").trim();
-                String timeZone = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
-                        .select("li:contains(所在時區)").text().replace("所在時區", "").trim();
-                String usedSpace = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
-                        .select("li").get(0).text().replace("已用空間", "").replace("B", "").trim();
-                String mileage = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
-                        .select("li").get(1).text().replace("里程", "").trim();
-                String money = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
-                        .select("li").last().text().replace("金錢", "").trim();
 
                 CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
                 crawlerJavbusProfile.setUid(Long.valueOf(uid));
                 crawlerJavbusProfile.setNickName(nickName);
-                crawlerJavbusProfile.setEmailStatus(emailStatus);
-                crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
-                crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
-                crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
-                crawlerJavbusProfile.setUserGroup(userGroup);
-                crawlerJavbusProfile.setOnlineTime(StringUtils.isNotEmpty(onlineTime) ? Integer.valueOf(onlineTime) : null);
-                crawlerJavbusProfile.setRegistrationTime(StringUtils.isNotEmpty(registrationTime) && registrationTime.length() >= 10 ? LocalDateTime.parse(registrationTime, DateUtils.dateTimeFormatter3) : null);
-                crawlerJavbusProfile.setLastVisit(StringUtils.isNotEmpty(lastVisit) && lastVisit.length() >= 10 ? LocalDateTime.parse(lastVisit, DateUtils.dateTimeFormatter3) : null);
-                crawlerJavbusProfile.setLastActivityTime(StringUtils.isNotEmpty(lastActivityTime) && lastActivityTime.length() >= 10 ? LocalDateTime.parse(lastActivityTime, DateUtils.dateTimeFormatter3) : null);
-                crawlerJavbusProfile.setLastPublishedTime(StringUtils.isNotEmpty(lastPublishedTime) && lastPublishedTime.length() >= 10 ? LocalDateTime.parse(lastPublishedTime, DateUtils.dateTimeFormatter3) : null);
-                crawlerJavbusProfile.setTimeZone(StringUtils.isNotEmpty(timeZone) ? timeZone : null);
-                crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
-                crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
-                crawlerJavbusProfile.setMoney(Integer.valueOf(money));
-                crawlerJavbusProfile.setAvatarUrl(avatarUrl);
-                crawlerJavbusProfile.setSignStr(signStr);
-                crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
+                parseJavbusProfile(profileDocument, crawlerJavbusProfile);
                 crawlerJavbusProfileMapper.insertOrUpdate(crawlerJavbusProfile);
                 log.warn("jsoupJavbusProfileSub成功插入,uid={}", uid);
             } catch (Exception e) {
                 log.error("jsoupJavbusProfileSub插入异常,nickName={},uid={}", nickName, uid, e);
                 CrawlerJavbusLog crawlerJavbusLog = new CrawlerJavbusLog();
                 crawlerJavbusLog.setType(1);
+                crawlerJavbusLog.setStatus(1);
                 crawlerJavbusLog.setBusinessKey(uid);
                 crawlerJavbusLog.setErrorMsg(e.getMessage());
                 crawlerJavbusProfileMapper.insertOrUpdateLog(crawlerJavbusLog);
@@ -207,6 +214,67 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
         }
     }
 
+    /**
+     * Reads the profile fields out of a fetched profile page and copies them onto the given entity.
+     * <p>
+     * Every value is extracted from the page first and only afterwards written through the setters,
+     * so a page without the expected structure fails before the entity is modified. The uid and
+     * nick name are not set here.
+     *
+     * @param profileDocument      parsed profile page
+     * @param crawlerJavbusProfile entity that receives the parsed values
+     */
+    public void parseJavbusProfile(Document profileDocument, CrawlerJavbusProfile crawlerJavbusProfile) {
+        String avatarUrl = profileDocument.select("div.avt").select("img").attr("src");
+
+        Elements profileSections = profileDocument.select("div.u_profile").select("div.cl");
+
+        // First section: e-mail status, signature and the friend/reply/thread counters
+        Elements basicLists = profileSections.get(0).select("ul");
+        String emailStatus = basicLists.first().text().replace("郵箱狀態", "").trim();
+
+        Elements signEles = basicLists.get(1).select("li:contains(個人簽名)");
+        String signStr = "";
+        ArrayList<String> signImgList = new ArrayList<>();
+        if (!signEles.isEmpty()) {
+            Elements signTables = signEles.first().select("table");
+            signStr = signTables.text();
+            for (Element signImgEle : signTables.select("img")) {
+                signImgList.add(signImgEle.attr("src"));
+            }
+        }
+
+        Elements counterLinks = basicLists.get(2).select("a");
+        String friendNum = counterLinks.get(0).text().replace("好友數", "").trim();
+        String replyNum = counterLinks.get(1).text().replace("回帖數", "").trim();
+        String threadNum = counterLinks.get(2).text().replace("主題數", "").trim();
+
+        // Second section: user group and the activity timestamps
+        Elements activityLists = profileSections.get(1).select("ul");
+        String userGroup = activityLists.get(0).select("a").text();
+        Element activityList = activityLists.get(1);
+        String onlineTime = activityList.select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
+        String registrationTime = labelledProfileValue(activityList, "註冊時間");
+        String lastVisit = labelledProfileValue(activityList, "最後訪問");
+        String lastActivityTime = labelledProfileValue(activityList, "上次活動時間");
+        String lastPublishedTime = labelledProfileValue(activityList, "上次發表時間");
+        String timeZone = labelledProfileValue(activityList, "所在時區");
+
+        // Third section: used space, mileage and money
+        Elements statItems = profileSections.get(2).select("ul").get(0).select("li");
+        String usedSpace = statItems.get(0).text().replace("已用空間", "").replace("B", "").trim();
+        String mileage = statItems.get(1).text().replace("里程", "").trim();
+        String money = statItems.last().text().replace("金錢", "").trim();
+
+        crawlerJavbusProfile.setEmailStatus(emailStatus);
+        crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
+        crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
+        crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
+        crawlerJavbusProfile.setUserGroup(userGroup);
+        if (StringUtils.isNotEmpty(onlineTime)) {
+            crawlerJavbusProfile.setOnlineTime(Integer.valueOf(onlineTime));
+        } else {
+            crawlerJavbusProfile.setOnlineTime(null);
+        }
+        crawlerJavbusProfile.setRegistrationTime(parseProfileDateTime(registrationTime));
+        crawlerJavbusProfile.setLastVisit(parseProfileDateTime(lastVisit));
+        crawlerJavbusProfile.setLastActivityTime(parseProfileDateTime(lastActivityTime));
+        crawlerJavbusProfile.setLastPublishedTime(parseProfileDateTime(lastPublishedTime));
+        crawlerJavbusProfile.setTimeZone(StringUtils.isNotEmpty(timeZone) ? timeZone : null);
+        crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
+        crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
+        crawlerJavbusProfile.setMoney(Integer.valueOf(money));
+        crawlerJavbusProfile.setAvatarUrl(avatarUrl);
+        crawlerJavbusProfile.setSignStr(signStr);
+        crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
+    }
+
+    /**
+     * Returns the text of the list items containing the label, with the label removed and the rest trimmed.
+     */
+    private String labelledProfileValue(Element list, String label) {
+        return list.select("li:contains(" + label + ")").text().replace(label, "").trim();
+    }
+
+    /**
+     * Parses a profile timestamp, or returns null when the text is empty or shorter than 10 characters.
+     */
+    private LocalDateTime parseProfileDateTime(String text) {
+        if (StringUtils.isNotEmpty(text) && text.length() >= 10) {
+            return LocalDateTime.parse(text, DateUtils.dateTimeFormatter3);
+        }
+        return null;
+    }
+
     private boolean generateJavbusCookies(Proxy proxy) throws Exception {
         // 1 登陆获取cookies
         // 1.0 https://www.javbus.com/forum/forum.php

+ 15 - 2
src/main/resources/static/crawler.html

@@ -167,9 +167,11 @@
             <span>Password</span>
             <input type="password" name="password" placeholder="Facebook密码" style="width: 100px;"/>
             <span>Group Id / URL</span>
-            <input type="text" name="url" placeholder="群组的唯一标示或者主页地址,多个群组用英文分号隔开" style="width: 350px;" />
+            <input type="text" name="url" placeholder="群组的唯一标示或者主页地址,多个群组用英文分号隔开"
+                   style="width: 350px;"/>
             <span>Limit</span>
-            <input type="text" name="limit" placeholder="限制爬取群组中的成员数,不填或者小于等于0代表无限制" style="width: 350px;"/>
+            <input type="text" name="limit" placeholder="限制爬取群组中的成员数,不填或者小于等于0代表无限制"
+                   style="width: 350px;"/>
             <input type="submit" value="提交">
         </form>
     </div>
@@ -184,6 +186,17 @@
             <input type="submit" value="提交">
         </form>
     </div>
+    <div style="margin-right:20px;">
+        <span class="font">handleJavbusLog</span>
+        <form method="post" action="bg/crawler/handleJavbusLog">
+            <span>status</span>
+            <select name="status" style="height: 21.43px;">
+                <option value="1">待解决</option>
+                <option value="3">未解决</option>
+            </select>
+            <input type="submit" value="提交">
+        </form>
+    </div>
 </div>
 </body>
 </html>