Kaynağa Gözat

update:优化根据足关键词解析code v1

tujidelv 3 yıl önce
ebeveyn
işleme
d723a20c18

+ 2 - 2
src/main/java/top/lvzhiqiang/config/MyJobs.java

@@ -204,10 +204,10 @@ public class MyJobs {
     /**
      * 每天23:55 jsoupLoveFoot4CrawingFail
      */
-    // @Scheduled(cron = "0 55 23 * * ?", zone = SCHEDULED_ZONE)
+    @Scheduled(cron = "0 55 23 * * ?", zone = SCHEDULED_ZONE)
     public void jsoupLoveFoot4CrawingFail() {
         log.warn("jsoupLoveFoot4CrawingFail开始==============================");
 
-        crawler4LoveFootService.jsoupLoveFoot4CrawingFail(4, 2);
+        crawler4LoveFootService.jsoupLoveFoot4CrawingFail(4, 2, "javbus");
     }
 }

+ 2 - 2
src/main/java/top/lvzhiqiang/controller/CrawlerController.java

@@ -238,7 +238,7 @@ public class CrawlerController {
      */
     @RequestMapping("/jsoupLoveFoot")
     @ResponseBody
-    public R jsoupLoveFoot(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
+    public R jsoupLoveFoot(Integer status, Integer isDel, Integer ignoreRetryCount, String website) throws Exception {
         if (null == isDel) {
             isDel = 2;
         }
@@ -247,7 +247,7 @@ public class CrawlerController {
         }
 
         if (4 == status) {
-            crawler4LoveFootService.jsoupLoveFoot4CrawingFail(status, ignoreRetryCount);
+            crawler4LoveFootService.jsoupLoveFoot4CrawingFail(status, ignoreRetryCount, website);
         } else {
             crawler4LoveFootService.jsoupLoveFoot4avnoashi(status, isDel, ignoreRetryCount);
         }

+ 1 - 1
src/main/java/top/lvzhiqiang/service/Crawler4LoveFootService.java

@@ -10,5 +10,5 @@ public interface Crawler4LoveFootService {
 
     void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception;
 
-    void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount);
+    void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website);
 }

+ 226 - 45
src/main/java/top/lvzhiqiang/service/impl/Crawler4LoveFootServiceImpl.java

@@ -109,7 +109,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
     @Async
     @Override
     @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
-    public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount) {
+    public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website) {
         log.warn("jjsoupLoveFoot4CrawingFail 开始");
         StopWatch stopWatch = new StopWatch();
         stopWatch.start();
@@ -135,66 +135,173 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
         javdbConstantMap = dicCodeList.stream()
                 .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
                 .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
+
+        // 获取javbus防屏蔽地址
+        if ("javbus".equals(website)) {
+            javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
+            if (javbusUrlList.size() == 0) {
+                log.warn("javbusUrlList为空");
+                return;
+            }
+        }
+
         // 代理及TOKEN设置
         beforeProxy();
         // 解析原始站点
 
-        jsoupLoveFoot4CrawingFailSub(loveFootList);
-        log.warn("jjsoupLoveFoot4CrawingFail 结束:time={}", stopWatch.getTotalTimeSeconds());
+        int successCount = jsoupLoveFoot4CrawingFailSub(loveFootList, website);
+        log.warn("jjsoupLoveFoot4CrawingFail 结束:totalCount={},successCount={},time={}", loveFootList.size(), successCount, stopWatch.getTotalTimeSeconds());
     }
 
     @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
-    public void jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList) {
-        Document javdbSearchDocument;
-        Document javdbCodeDocument;
+    public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
+        int successCount = 0;
         for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
+            Document searchDocument = null;
+            Document codeDocument;
             String message = null;
             int retryCount = 0;
             while (retryCount <= 3) {
                 long start = System.currentTimeMillis();
-                String javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
-                header3Map.put("referer", javdbSearchUrl);
+                String searchUrl = null;
+                Elements itembSelects = null;
                 try {
-                    javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
-
-                    Elements itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
-                    if (itembSelects.size() == 0) {
-                        String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
-                        javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
-                        javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
-                        itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
-                    }
-                    if (itembSelects.size() == 0) {
-                        throw new BusinessException(30000, "javdb search result null");
+                    if ("javbus".equals(website)) {
+                        String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
+                        searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
+                        try {
+                            searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                        } catch (Exception ee) {
+                            String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
+                            searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+
+                            try {
+                                searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                            } catch (Exception eee) {
+                                newName = newName.substring(newName.length() / 2);
+                                searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+
+                                try {
+                                    searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                                } catch (Exception eeee) {
+                                    // throw new BusinessException(30000, "javbus search result null");
+                                }
+                            }
+                        }
+
+                        if (null == searchDocument) {
+                            String newName = crawlerLoveFoot.getName().replace("●", "");
+                            searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                            try {
+                                searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                            } catch (Exception ee) {
+                                newName = newName.substring(0, newName.length() / 2);
+                                searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                                try {
+                                    searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                                } catch (Exception eee) {
+                                    newName = newName.substring(0, newName.length() / 2);
+                                    searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                                    try {
+                                        searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                                    } catch (Exception eeee) {
+                                        throw new BusinessException(30000, "javbus search result null");
+                                    }
+                                }
+                            }
+                        }
+
+                        itembSelects = searchDocument.select("div#waterfall").select("div.item");
+
+                        if (itembSelects.size() == 0) {
+                            throw new BusinessException(30000, "javbus search result null");
+                        }
+                    } else if ("javdb".equals(website)) {
+                        searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
+                        header3Map.put("referer", searchUrl);
+
+                        searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
+
+                        itembSelects = searchDocument.select("div.movie-list").select("div.item");
+                        if (itembSelects.size() == 0) {
+                            String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
+                            searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
+                            searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
+                            itembSelects = searchDocument.select("div.movie-list").select("div.item");
+                        }
+
+                        if (itembSelects.size() == 0) {
+                            throw new BusinessException(30000, "javdb search result null");
+                        }
                     }
 
                     // 获取codeUrl
                     String codeUrl = null;
                     String title;
-                    for (Element itembSelect : itembSelects) {
-                        title = itembSelect.select("a.box").get(0).attr("title");
-                        if (title.contains(crawlerLoveFoot.getName())) {
-                            codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
-                            break;
-                        }
 
-                        String newName = crawlerLoveFoot.getName().replace("●", "さ");
-                        if (title.contains(newName)) {
-                            codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
-                            crawlerLoveFoot.setName(newName);
-                            break;
+                    if ("javbus".equals(website)) {
+                        for (Element itembSelect : itembSelects) {
+                            title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
+                            if (title.contains(crawlerLoveFoot.getName())) {
+                                codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
+                                break;
+                            }
+
+                            String newName = crawlerLoveFoot.getName().replace("●", "さ");
+                            if (title.contains(newName)) {
+                                codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
+                                crawlerLoveFoot.setName(newName);
+                                break;
+                            }
+
+                            String[] newNameArr = crawlerLoveFoot.getName().split("●");
+                            int matchCount = 0;
+                            for (String s : newNameArr) {
+                                if (title.contains(s)) {
+                                    matchCount++;
+                                }
+                            }
+                            if (newNameArr.length == matchCount) {
+                                codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
+                                crawlerLoveFoot.setName(title);
+                                break;
+                            }
+                        }
+                        if (StringUtils.isEmpty(codeUrl)) {
+                            throw new BusinessException(30000, "javbus search result mismatch");
+                        }
+                    } else if ("javdb".equals(website)) {
+                        for (Element itembSelect : itembSelects) {
+                            title = itembSelect.select("a.box").get(0).attr("title");
+                            if (title.contains(crawlerLoveFoot.getName())) {
+                                codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
+                                break;
+                            }
+
+                            String newName = crawlerLoveFoot.getName().replace("●", "さ");
+                            if (title.contains(newName)) {
+                                codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
+                                crawlerLoveFoot.setName(newName);
+                                break;
+                            }
+                        }
+                        if (StringUtils.isEmpty(codeUrl)) {
+                            throw new BusinessException(30000, "javdb search result mismatch");
                         }
                     }
 
-                    if (StringUtils.isEmpty(codeUrl)) {
-                        throw new BusinessException(30000, "javdb search result mismatch");
-                    }
 
                     // 解析codeUrl
-                    javdbCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
-                    long picTime = parseJavdbCodeDocument(javdbCodeDocument, crawlerLoveFoot);
+                    long picTime = 999;
+                    if ("javbus".equals(website)) {
+                        codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                        picTime = parseJavbusCodeDocument(codeDocument, crawlerLoveFoot);
+                    } else if ("javdb".equals(website)) {
+                        codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
+                        picTime = parseJavdbCodeDocument(codeDocument, crawlerLoveFoot);
+                        crawlerLoveFoot.setJavdbUrl(codeUrl);
+                    }
 
-                    crawlerLoveFoot.setJavdbUrl(codeUrl);
                     crawlerLoveFoot.setRetryCount(retryCount);
                     crawlerLoveFoot.setType(2);
                     crawlerLoveFoot.setStatus(3);
@@ -205,7 +312,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
                     ++retryCount;
 
                     if (retryCount < 4) {
-                        log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, javdbSearchUrl, e);
+                        log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, searchUrl, e);
                     } else if (retryCount == 4) {
                         message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
                     }
@@ -226,9 +333,10 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
             } else {
                 crawlerLoveFoot.setFailureCause("");
                 crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot);
+                successCount++;
             }
         }
-
+        return successCount;
     }
 
     private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException {
@@ -376,6 +484,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
                     crawlerLoveFoot.setOrginUrl(sourceUrl);
                     crawlerLoveFoot.setType(2);
                     crawlerLoveFoot.setStatus(3);
+                    crawlerLoveFoot.setCreateTime(LocalDateTime.now());
                     String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
                     if (StringUtils.isNotEmpty(message)) {
                         statusInt = 4;
@@ -411,15 +520,53 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
 
     private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
         int retryCount = 0;
-        Document javbusSearchDocument;
+        Document javbusSearchDocument = null;
         Document javbusCodeDocument;
         String message = null;
         while (retryCount <= 3) {
             long start = System.currentTimeMillis();
-            String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
-            String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
             try {
-                javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
+                String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
+                try {
+                    javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                } catch (Exception ee) {
+                    String newName = keywords.substring(keywords.length() / 2);
+                    javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                    try {
+                        javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                    } catch (Exception eee) {
+                        newName = newName.substring(newName.length() / 2);
+                        javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                        try {
+                            javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                        } catch (Exception eeee) {
+                            // throw new BusinessException(30000, "javbus search result null");
+                        }
+                    }
+                }
+
+                if (null == javbusSearchDocument) {
+                    String newName = keywords.replace("●", "");
+                    javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                    try {
+                        javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                    } catch (Exception ee) {
+                        newName = newName.substring(0, newName.length() / 2);
+                        javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                        try {
+                            javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                        } catch (Exception eee) {
+                            newName = newName.substring(0, newName.length() / 2);
+                            javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
+                            try {
+                                javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
+                            } catch (Exception eeee) {
+                                throw new BusinessException(30000, "javbus search result null");
+                            }
+                        }
+                    }
+                }
 
                 Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
                 if (itembSelects.size() == 0) {
@@ -427,7 +574,40 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
                 }
 
                 // 获取codeUrl
-                String codeUrl = itembSelects.select("a.movie-box").get(0).attr("abs:href");
+                String codeUrl = null;
+                String title;
+                for (Element itembSelect : itembSelects) {
+                    title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
+                    if (title.contains(keywords)) {
+                        codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
+                        break;
+                    }
+
+                    String newName = keywords.replace("●", "さ");
+                    if (title.contains(newName)) {
+                        codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
+                        crawlerLoveFoot.setName(newName);
+                        break;
+                    }
+
+                    String[] newNameArr = keywords.split("●");
+                    int matchCount = 0;
+                    for (String s : newNameArr) {
+                        if (title.contains(s)) {
+                            matchCount++;
+                        }
+                    }
+                    if (newNameArr.length == matchCount) {
+                        codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
+                        crawlerLoveFoot.setName(title);
+                        break;
+                    }
+                }
+
+                if (StringUtils.isEmpty(codeUrl)) {
+                    throw new BusinessException(30000, "javbus search result mismatch");
+                }
+
                 // 解析codeUrl
                 javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
                 long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
@@ -550,7 +730,6 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
         long end = System.currentTimeMillis();
 
         crawlerLoveFoot.setImgUrl(machiImgUrl);
-        crawlerLoveFoot.setCreateTime(LocalDateTime.now());
 
         return end - start;
     }
@@ -578,8 +757,10 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
     }
 
     public static void main(String[] args) {
-        String s = "嫉妬に狂った愛人のエグい杭打ちピストンにどハマり…都合の良いオンナのはずが快楽沼へ引きずり込まれた僕 七ツ森りり";
+        String s = "リア充反対!彼女の目の前で彼氏を拘束、●す鬼畜痴女";
         String newName = s.substring(s.length() / 2);
+
+        newName = newName.substring(newName.length() / 2);
         System.out.println(newName);
     }
 }

+ 5 - 0
src/main/resources/static/crawler.html

@@ -247,6 +247,11 @@
                 <option value="4">爬取失败</option>
                 <option value="">从0开始</option>
             </select>
+            <span>website</span>
+            <select name="website" style="height: 21.43px;">
+                <option value="javdb">javdb</option>
+                <option value="javbus">javbus</option>
+            </select>
             <span>isDel</span>
             <input type="text" name="isDel" placeholder="1:是,2:否。默认否"/>
             <span>ignoreRetryCount</span>