|
|
@@ -109,7 +109,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
@Async
|
|
|
@Override
|
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
- public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount) {
|
|
|
+ public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website) {
|
|
|
log.warn("jjsoupLoveFoot4CrawingFail 开始");
|
|
|
StopWatch stopWatch = new StopWatch();
|
|
|
stopWatch.start();
|
|
|
@@ -135,66 +135,173 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
javdbConstantMap = dicCodeList.stream()
|
|
|
.filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
|
|
|
.collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+
|
|
|
+ // 获取javbus防屏蔽地址
|
|
|
+ if ("javbus".equals(website)) {
|
|
|
+ javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
|
|
|
+ if (javbusUrlList.size() == 0) {
|
|
|
+ log.warn("javbusUrlList为空");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
// 代理及TOKEN设置
|
|
|
beforeProxy();
|
|
|
// 解析原始站点
|
|
|
|
|
|
- jsoupLoveFoot4CrawingFailSub(loveFootList);
|
|
|
- log.warn("jjsoupLoveFoot4CrawingFail 结束:time={}", stopWatch.getTotalTimeSeconds());
|
|
|
+ int successCount = jsoupLoveFoot4CrawingFailSub(loveFootList, website);
|
|
|
+ log.warn("jjsoupLoveFoot4CrawingFail 结束:totalCount={},successCount={},time={}", loveFootList.size(), successCount, stopWatch.getTotalTimeSeconds());
|
|
|
}
|
|
|
|
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
- public void jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList) {
|
|
|
- Document javdbSearchDocument;
|
|
|
- Document javdbCodeDocument;
|
|
|
+ public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
|
|
|
+ int successCount = 0;
|
|
|
for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
|
|
|
+ Document searchDocument = null;
|
|
|
+ Document codeDocument;
|
|
|
String message = null;
|
|
|
int retryCount = 0;
|
|
|
while (retryCount <= 3) {
|
|
|
long start = System.currentTimeMillis();
|
|
|
- String javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
|
|
|
- header3Map.put("referer", javdbSearchUrl);
|
|
|
+ String searchUrl = null;
|
|
|
+ Elements itembSelects = null;
|
|
|
try {
|
|
|
- javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
-
|
|
|
- Elements itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
|
|
|
- if (itembSelects.size() == 0) {
|
|
|
- String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
|
- javdbSearchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
|
|
|
- javdbSearchDocument = JsoupUtil.requestDocument(javdbSearchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
- itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
|
|
|
- }
|
|
|
- if (itembSelects.size() == 0) {
|
|
|
- throw new BusinessException(30000, "javdb search result null");
|
|
|
+ if ("javbus".equals(website)) {
|
|
|
+ String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
|
+ searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception ee) {
|
|
|
+ String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
|
+ searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+
|
|
|
+ try {
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eee) {
|
|
|
+ newName = newName.substring(newName.length() / 2);
|
|
|
+ searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+
|
|
|
+ try {
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eeee) {
|
|
|
+ // throw new BusinessException(30000, "javbus search result null");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (null == searchDocument) {
|
|
|
+ String newName = crawlerLoveFoot.getName().replace("●", "");
|
|
|
+ searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception ee) {
|
|
|
+ newName = newName.substring(0, newName.length() / 2);
|
|
|
+ searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eee) {
|
|
|
+ newName = newName.substring(0, newName.length() / 2);
|
|
|
+ searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eeee) {
|
|
|
+ throw new BusinessException(30000, "javbus search result null");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ itembSelects = searchDocument.select("div#waterfall").select("div.item");
|
|
|
+
|
|
|
+ if (itembSelects.size() == 0) {
|
|
|
+ throw new BusinessException(30000, "javbus search result null");
|
|
|
+ }
|
|
|
+ } else if ("javdb".equals(website)) {
|
|
|
+ searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
|
|
|
+ header3Map.put("referer", searchUrl);
|
|
|
+
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
+
|
|
|
+ itembSelects = searchDocument.select("div.movie-list").select("div.item");
|
|
|
+ if (itembSelects.size() == 0) {
|
|
|
+ String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
|
+ searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
+ itembSelects = searchDocument.select("div.movie-list").select("div.item");
|
|
|
+ }
|
|
|
+
|
|
|
+ if (itembSelects.size() == 0) {
|
|
|
+ throw new BusinessException(30000, "javdb search result null");
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
// 获取codeUrl
|
|
|
String codeUrl = null;
|
|
|
String title;
|
|
|
- for (Element itembSelect : itembSelects) {
|
|
|
- title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
- if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
- break;
|
|
|
- }
|
|
|
|
|
|
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
- if (title.contains(newName)) {
|
|
|
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
- crawlerLoveFoot.setName(newName);
|
|
|
- break;
|
|
|
+ if ("javbus".equals(website)) {
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
+ title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
|
|
|
+ if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
+ codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
+ if (title.contains(newName)) {
|
|
|
+ codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ String[] newNameArr = crawlerLoveFoot.getName().split("●");
|
|
|
+ int matchCount = 0;
|
|
|
+ for (String s : newNameArr) {
|
|
|
+ if (title.contains(s)) {
|
|
|
+ matchCount++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (newNameArr.length == matchCount) {
|
|
|
+ codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ crawlerLoveFoot.setName(title);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(codeUrl)) {
|
|
|
+ throw new BusinessException(30000, "javbus search result mismatch");
|
|
|
+ }
|
|
|
+ } else if ("javdb".equals(website)) {
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
+ title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
+ if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
+ if (title.contains(newName)) {
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(codeUrl)) {
|
|
|
+ throw new BusinessException(30000, "javdb search result mismatch");
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (StringUtils.isEmpty(codeUrl)) {
|
|
|
- throw new BusinessException(30000, "javdb search result mismatch");
|
|
|
- }
|
|
|
|
|
|
// 解析codeUrl
|
|
|
- javdbCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
- long picTime = parseJavdbCodeDocument(javdbCodeDocument, crawlerLoveFoot);
|
|
|
+ long picTime = 999;
|
|
|
+ if ("javbus".equals(website)) {
|
|
|
+ codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ picTime = parseJavbusCodeDocument(codeDocument, crawlerLoveFoot);
|
|
|
+ } else if ("javdb".equals(website)) {
|
|
|
+ codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
+ picTime = parseJavdbCodeDocument(codeDocument, crawlerLoveFoot);
|
|
|
+ crawlerLoveFoot.setJavdbUrl(codeUrl);
|
|
|
+ }
|
|
|
|
|
|
- crawlerLoveFoot.setJavdbUrl(codeUrl);
|
|
|
crawlerLoveFoot.setRetryCount(retryCount);
|
|
|
crawlerLoveFoot.setType(2);
|
|
|
crawlerLoveFoot.setStatus(3);
|
|
|
@@ -205,7 +312,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
++retryCount;
|
|
|
|
|
|
if (retryCount < 4) {
|
|
|
- log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, javdbSearchUrl, e);
|
|
|
+ log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, searchUrl, e);
|
|
|
} else if (retryCount == 4) {
|
|
|
message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
|
|
|
}
|
|
|
@@ -226,9 +333,10 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
} else {
|
|
|
crawlerLoveFoot.setFailureCause("");
|
|
|
crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot);
|
|
|
+ successCount++;
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+ return successCount;
|
|
|
}
|
|
|
|
|
|
private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException {
|
|
|
@@ -376,6 +484,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
crawlerLoveFoot.setOrginUrl(sourceUrl);
|
|
|
crawlerLoveFoot.setType(2);
|
|
|
crawlerLoveFoot.setStatus(3);
|
|
|
+ crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
|
|
|
if (StringUtils.isNotEmpty(message)) {
|
|
|
statusInt = 4;
|
|
|
@@ -411,15 +520,53 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
|
|
|
private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
|
|
|
int retryCount = 0;
|
|
|
- Document javbusSearchDocument;
|
|
|
+ Document javbusSearchDocument = null;
|
|
|
Document javbusCodeDocument;
|
|
|
String message = null;
|
|
|
while (retryCount <= 3) {
|
|
|
long start = System.currentTimeMillis();
|
|
|
- String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
|
- String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
|
|
|
try {
|
|
|
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
|
+ String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception ee) {
|
|
|
+ String newName = keywords.substring(keywords.length() / 2);
|
|
|
+ javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eee) {
|
|
|
+ newName = newName.substring(newName.length() / 2);
|
|
|
+ javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eeee) {
|
|
|
+ // throw new BusinessException(30000, "javbus search result null");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (null == javbusSearchDocument) {
|
|
|
+ String newName = keywords.replace("●", "");
|
|
|
+ javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception ee) {
|
|
|
+ newName = newName.substring(0, newName.length() / 2);
|
|
|
+ javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eee) {
|
|
|
+ newName = newName.substring(0, newName.length() / 2);
|
|
|
+ javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
|
|
|
+ try {
|
|
|
+ javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
+ } catch (Exception eeee) {
|
|
|
+ throw new BusinessException(30000, "javbus search result null");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
|
|
|
if (itembSelects.size() == 0) {
|
|
|
@@ -427,7 +574,40 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
}
|
|
|
|
|
|
// 获取codeUrl
|
|
|
- String codeUrl = itembSelects.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ String codeUrl = null;
|
|
|
+ String title;
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
+ title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
|
|
|
+ if (title.contains(keywords)) {
|
|
|
+ codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ String newName = keywords.replace("●", "さ");
|
|
|
+ if (title.contains(newName)) {
|
|
|
+ codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ String[] newNameArr = keywords.split("●");
|
|
|
+ int matchCount = 0;
|
|
|
+ for (String s : newNameArr) {
|
|
|
+ if (title.contains(s)) {
|
|
|
+ matchCount++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (newNameArr.length == matchCount) {
|
|
|
+ codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
|
|
|
+ crawlerLoveFoot.setName(title);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (StringUtils.isEmpty(codeUrl)) {
|
|
|
+ throw new BusinessException(30000, "javbus search result mismatch");
|
|
|
+ }
|
|
|
+
|
|
|
// 解析codeUrl
|
|
|
javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
|
|
|
long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
|
|
|
@@ -550,7 +730,6 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
long end = System.currentTimeMillis();
|
|
|
|
|
|
crawlerLoveFoot.setImgUrl(machiImgUrl);
|
|
|
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
|
|
|
return end - start;
|
|
|
}
|
|
|
@@ -578,8 +757,10 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
- String s = "嫉妬に狂った愛人のエグい杭打ちピストンにどハマり…都合の良いオンナのはずが快楽沼へ引きずり込まれた僕 七ツ森りり";
|
|
|
+ String s = "リア充反対!彼女の目の前で彼氏を拘束、●す鬼畜痴女";
|
|
|
String newName = s.substring(s.length() / 2);
|
|
|
+
|
|
|
+ newName = newName.substring(newName.length() / 2);
|
|
|
System.out.println(newName);
|
|
|
}
|
|
|
}
|