|
@@ -68,7 +68,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
public void beforeProxy() {
|
|
public void beforeProxy() {
|
|
|
if (null == proxy) {
|
|
if (null == proxy) {
|
|
|
if ("dev".equals(env)) {
|
|
if ("dev".equals(env)) {
|
|
|
- proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
|
|
|
|
|
|
|
+ proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
|
|
|
} else {
|
|
} else {
|
|
|
proxy = Proxy.NO_PROXY;
|
|
proxy = Proxy.NO_PROXY;
|
|
|
}
|
|
}
|
|
@@ -145,18 +145,24 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
@Async
|
|
@Async
|
|
|
@Override
|
|
@Override
|
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
- public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website) {
|
|
|
|
|
|
|
+ public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website, String identificationCode) {
|
|
|
log.warn("jjsoupLoveFoot4CrawingFail 开始");
|
|
log.warn("jjsoupLoveFoot4CrawingFail 开始");
|
|
|
StopWatch stopWatch = new StopWatch();
|
|
StopWatch stopWatch = new StopWatch();
|
|
|
stopWatch.start();
|
|
stopWatch.start();
|
|
|
|
|
|
|
|
// 获取待抓取码列表
|
|
// 获取待抓取码列表
|
|
|
List<CrawlerLoveFoot> loveFootList;
|
|
List<CrawlerLoveFoot> loveFootList;
|
|
|
- if (1 == ignoreRetryCount) {
|
|
|
|
|
- loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status);
|
|
|
|
|
|
|
+
|
|
|
|
|
+ if (StringUtils.isNotEmpty(identificationCode)) {
|
|
|
|
|
+ loveFootList = crawlerLoveFootMapper.findByCodeAndType(identificationCode, null, null);
|
|
|
} else {
|
|
} else {
|
|
|
- loveFootList = crawlerLoveFootMapper.findInfoByStatus(status);
|
|
|
|
|
|
|
+ if (1 == ignoreRetryCount) {
|
|
|
|
|
+ loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ loveFootList = crawlerLoveFootMapper.findInfoByStatus(status);
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
if (loveFootList.size() == 0) {
|
|
if (loveFootList.size() == 0) {
|
|
|
log.warn("loveFootList为空");
|
|
log.warn("loveFootList为空");
|
|
|
return;
|
|
return;
|
|
@@ -192,13 +198,27 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
@Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
|
|
public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
|
|
|
int successCount = 0;
|
|
int successCount = 0;
|
|
|
|
|
+ Document loveFootDetailDocument;
|
|
|
for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
|
|
for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
|
|
|
Document searchDocument = null;
|
|
Document searchDocument = null;
|
|
|
Document codeDocument;
|
|
Document codeDocument;
|
|
|
String message = null;
|
|
String message = null;
|
|
|
int retryCount = 0;
|
|
int retryCount = 0;
|
|
|
|
|
|
|
|
- if (StringUtils.isEmpty(crawlerLoveFoot.getName())) {
|
|
|
|
|
|
|
+ if (StringUtils.isEmpty(crawlerLoveFoot.getName()) && crawlerLoveFoot.getOrginAvnoashiUrl().contains("avnoashi-1.com")) {
|
|
|
|
|
+ try {
|
|
|
|
|
+ loveFootDetailDocument = JsoupUtil.requestDocument(crawlerLoveFoot.getOrginAvnoashiUrl(), JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
|
|
|
|
|
+ // 获取关键词
|
|
|
|
|
+ String keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
|
|
|
|
|
+ if (StringUtils.isNotEmpty(keywords)) {
|
|
|
|
|
+ crawlerLoveFoot.setName(keywords);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab");
|
|
|
|
|
+ }
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab");
|
|
|
|
|
+ }
|
|
|
|
|
+ } else if (StringUtils.isEmpty(crawlerLoveFoot.getName()) && crawlerLoveFoot.getOrginJpfootUrl().contains("jp-foot.net")) {
|
|
|
crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
|
|
crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -210,6 +230,10 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
Thread.sleep(3000);
|
|
Thread.sleep(3000);
|
|
|
|
|
|
|
|
String javbusCodeUrl = null;
|
|
String javbusCodeUrl = null;
|
|
|
|
|
+
|
|
|
|
|
+ // 获取codeUrl
|
|
|
|
|
+ String codeUrl = null;
|
|
|
|
|
+ String title;
|
|
|
if ("javbus".equals(website)) {
|
|
if ("javbus".equals(website)) {
|
|
|
String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
|
searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
|
|
searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
|
|
@@ -273,28 +297,84 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
} else if ("javdb".equals(website)) {
|
|
} else if ("javdb".equals(website)) {
|
|
|
|
|
+ crawlerLoveFoot.setName(crawlerLoveFoot.getName().replace("%", "%"));
|
|
|
|
|
+
|
|
|
searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
|
|
searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
|
|
|
header3Map.put("referer", searchUrl);
|
|
header3Map.put("referer", searchUrl);
|
|
|
|
|
|
|
|
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
|
|
|
|
+ subsearch:
|
|
|
|
|
+ {
|
|
|
|
|
+ searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
|
|
+ itembSelects = searchDocument.select("div.movie-list").select("div.item");
|
|
|
|
|
+ if (itembSelects.size() != 0) {
|
|
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
|
|
+ title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
|
|
+ if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("●", "這");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("○", "〇");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- itembSelects = searchDocument.select("div.movie-list").select("div.item");
|
|
|
|
|
- if (itembSelects.size() == 0) {
|
|
|
|
|
String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
|
searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
|
|
searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
|
|
|
searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
itembSelects = searchDocument.select("div.movie-list").select("div.item");
|
|
itembSelects = searchDocument.select("div.movie-list").select("div.item");
|
|
|
- }
|
|
|
|
|
|
|
+ if (itembSelects.size() != 0) {
|
|
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
|
|
+ title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
|
|
+ if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("●", "這");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- if (itembSelects.size() == 0) {
|
|
|
|
|
- throw new BusinessException(30000, "javdb search result null");
|
|
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("○", "〇");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // 获取codeUrl
|
|
|
|
|
- String codeUrl = null;
|
|
|
|
|
- String title;
|
|
|
|
|
-
|
|
|
|
|
if ("javbus".equals(website)) {
|
|
if ("javbus".equals(website)) {
|
|
|
for (Element itembSelect : itembSelects) {
|
|
for (Element itembSelect : itembSelects) {
|
|
|
title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
|
|
title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
|
|
@@ -333,20 +413,6 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
throw new BusinessException(30000, "javbus search result mismatch");
|
|
throw new BusinessException(30000, "javbus search result mismatch");
|
|
|
}
|
|
}
|
|
|
} else if ("javdb".equals(website)) {
|
|
} else if ("javdb".equals(website)) {
|
|
|
- for (Element itembSelect : itembSelects) {
|
|
|
|
|
- title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
|
|
- if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
|
|
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
|
|
- if (title.contains(newName)) {
|
|
|
|
|
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
- crawlerLoveFoot.setName(newName);
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
if (StringUtils.isEmpty(codeUrl)) {
|
|
if (StringUtils.isEmpty(codeUrl)) {
|
|
|
throw new BusinessException(30000, "javdb search result mismatch");
|
|
throw new BusinessException(30000, "javdb search result mismatch");
|
|
|
}
|
|
}
|
|
@@ -477,7 +543,7 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
long start = System.currentTimeMillis();
|
|
long start = System.currentTimeMillis();
|
|
|
Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
|
|
Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
|
|
|
|
|
|
|
|
- String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(crawlerLoveFoot.getName());
|
|
|
|
|
|
|
+ String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(StringUtils.escapeJavParam(crawlerLoveFoot.getName()));
|
|
|
byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
|
|
byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
|
|
|
if (imgUrlBytes.length > 251) {
|
|
if (imgUrlBytes.length > 251) {
|
|
|
byte[] imgUrlDestBytes = new byte[251];
|
|
byte[] imgUrlDestBytes = new byte[251];
|
|
@@ -517,15 +583,28 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
|
|
header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
|
|
|
Document loveFootDocument;
|
|
Document loveFootDocument;
|
|
|
Document loveFootDetailDocument;
|
|
Document loveFootDetailDocument;
|
|
|
|
|
+ boolean tiaoguoFlag = true;
|
|
|
|
|
+ int pageNum = 0;
|
|
|
outer:
|
|
outer:
|
|
|
while (true) {
|
|
while (true) {
|
|
|
loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
|
|
loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
|
|
|
|
|
+
|
|
|
|
|
+ pageNum++;
|
|
|
|
|
+ if (pageNum > 50) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
|
|
log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
|
|
|
|
|
|
|
|
Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
|
|
Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
|
|
|
for (Element sourceSelect : sourceSelects) {
|
|
for (Element sourceSelect : sourceSelects) {
|
|
|
String sourceUrl = sourceSelect.select("a").attr("abs:href");
|
|
String sourceUrl = sourceSelect.select("a").attr("abs:href");
|
|
|
|
|
|
|
|
|
|
+ if (tiaoguoFlag) {
|
|
|
|
|
+ tiaoguoFlag = false;
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
Integer statusInt = 2;
|
|
Integer statusInt = 2;
|
|
|
Integer typeInt = 1;
|
|
Integer typeInt = 1;
|
|
|
LocalDate clockDate = null;
|
|
LocalDate clockDate = null;
|
|
@@ -533,14 +612,14 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
String keywords = null;
|
|
String keywords = null;
|
|
|
try {
|
|
try {
|
|
|
loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
|
|
loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
|
|
|
- String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
|
|
|
|
|
|
|
+ /*String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
|
|
|
String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
|
|
String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
|
|
|
clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
|
|
clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
|
|
|
updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
|
|
updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
|
|
|
|
|
|
|
|
if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
|
|
if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
|
|
|
break outer;
|
|
break outer;
|
|
|
- }
|
|
|
|
|
|
|
+ }*/
|
|
|
|
|
|
|
|
// 获取关键词
|
|
// 获取关键词
|
|
|
keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
|
|
keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
|
|
@@ -567,20 +646,26 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
throw new Exception(message);
|
|
throw new Exception(message);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
|
|
|
|
|
|
|
+ CrawlerLoveFoot exist = crawlerLoveFootMapper.findLoveFootByOrginAvnoashiUrl(sourceUrl);
|
|
|
|
|
+ if (exist == null || exist.getStatus() != 3) {
|
|
|
|
|
+ crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
|
|
|
|
|
+ }
|
|
|
} catch (Exception e) {
|
|
} catch (Exception e) {
|
|
|
- log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
|
|
|
|
|
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
|
|
|
|
|
- crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
|
|
|
|
|
- crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
|
|
|
|
|
- crawlerLoveFoot.setClockDate(clockDate);
|
|
|
|
|
- crawlerLoveFoot.setUpdateDate(updateDate);
|
|
|
|
|
- crawlerLoveFoot.setName(keywords);
|
|
|
|
|
- crawlerLoveFoot.setType(typeInt);
|
|
|
|
|
- crawlerLoveFoot.setStatus(statusInt);
|
|
|
|
|
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
|
|
- crawlerLoveFoot.setFailureCause(e.getMessage());
|
|
|
|
|
- crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
|
|
|
|
|
|
|
+ Integer exist = crawlerLoveFootMapper.existLoveFootByOrginAvnoashiUrl(sourceUrl);
|
|
|
|
|
+ if (exist == null) {
|
|
|
|
|
+ log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
|
|
|
|
|
+ CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
|
|
|
|
|
+ crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
|
|
|
|
|
+ crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
|
|
|
|
|
+ crawlerLoveFoot.setClockDate(clockDate);
|
|
|
|
|
+ crawlerLoveFoot.setUpdateDate(updateDate);
|
|
|
|
|
+ crawlerLoveFoot.setName(keywords);
|
|
|
|
|
+ crawlerLoveFoot.setType(typeInt);
|
|
|
|
|
+ crawlerLoveFoot.setStatus(statusInt);
|
|
|
|
|
+ crawlerLoveFoot.setCreateTime(LocalDateTime.now());
|
|
|
|
|
+ crawlerLoveFoot.setFailureCause(e.getMessage());
|
|
|
|
|
+ crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -699,6 +784,11 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
long start = System.currentTimeMillis();
|
|
long start = System.currentTimeMillis();
|
|
|
Elements itembSelects = null;
|
|
Elements itembSelects = null;
|
|
|
try {
|
|
try {
|
|
|
|
|
+ Thread.sleep(3000);
|
|
|
|
|
+
|
|
|
|
|
+ // 获取codeUrl
|
|
|
|
|
+ String codeUrl = null;
|
|
|
|
|
+ String title;
|
|
|
if ("javbus".equals(website)) {
|
|
if ("javbus".equals(website)) {
|
|
|
String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
|
|
|
String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
|
|
String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
|
|
@@ -747,28 +837,83 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
throw new BusinessException(30000, "javbus search result null");
|
|
throw new BusinessException(30000, "javbus search result null");
|
|
|
}
|
|
}
|
|
|
} else if ("javdb".equals(website)) {
|
|
} else if ("javdb".equals(website)) {
|
|
|
|
|
+ crawlerLoveFoot.setName(crawlerLoveFoot.getName().replace("%", "%").replace("#", "#").replace("?", "?"));
|
|
|
|
|
+
|
|
|
String searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
|
|
String searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
|
|
|
header3Map.put("referer", searchUrl);
|
|
header3Map.put("referer", searchUrl);
|
|
|
|
|
|
|
|
- javdbSearchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
|
|
|
|
+ subsearch:
|
|
|
|
|
+ {
|
|
|
|
|
+ javdbSearchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
|
|
+ itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
|
|
|
|
|
+ if (itembSelects.size() != 0) {
|
|
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
|
|
+ title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
|
|
+ if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("●", "這");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("○", "〇");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
|
|
|
|
|
- if (itembSelects.size() == 0) {
|
|
|
|
|
String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
|
|
|
searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
|
|
searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
|
|
|
javdbSearchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
javdbSearchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
|
|
|
itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
|
|
itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
|
|
|
- }
|
|
|
|
|
|
|
+ if (itembSelects.size() != 0) {
|
|
|
|
|
+ for (Element itembSelect : itembSelects) {
|
|
|
|
|
+ title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
|
|
+ if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- if (itembSelects.size() == 0) {
|
|
|
|
|
- throw new BusinessException(30000, "javdb search result null");
|
|
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("●", "這");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ newName = crawlerLoveFoot.getName().replace("○", "〇");
|
|
|
|
|
+ if (title.contains(newName)) {
|
|
|
|
|
+ codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
+ crawlerLoveFoot.setName(newName);
|
|
|
|
|
+ break subsearch;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // 获取codeUrl
|
|
|
|
|
- String codeUrl = null;
|
|
|
|
|
- String title;
|
|
|
|
|
-
|
|
|
|
|
if ("javbus".equals(website)) {
|
|
if ("javbus".equals(website)) {
|
|
|
for (Element itembSelect : itembSelects) {
|
|
for (Element itembSelect : itembSelects) {
|
|
|
title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
|
|
title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
|
|
@@ -802,20 +947,6 @@ public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
|
|
|
throw new BusinessException(30000, "javbus search result mismatch");
|
|
throw new BusinessException(30000, "javbus search result mismatch");
|
|
|
}
|
|
}
|
|
|
} else if ("javdb".equals(website)) {
|
|
} else if ("javdb".equals(website)) {
|
|
|
- for (Element itembSelect : itembSelects) {
|
|
|
|
|
- title = itembSelect.select("a.box").get(0).attr("title");
|
|
|
|
|
- if (title.contains(crawlerLoveFoot.getName())) {
|
|
|
|
|
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
|
|
|
|
|
- if (title.contains(newName)) {
|
|
|
|
|
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
|
|
|
|
|
- crawlerLoveFoot.setName(newName);
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
if (StringUtils.isEmpty(codeUrl)) {
|
|
if (StringUtils.isEmpty(codeUrl)) {
|
|
|
throw new BusinessException(30000, "javdb search result mismatch");
|
|
throw new BusinessException(30000, "javdb search result mismatch");
|
|
|
}
|
|
}
|