| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909 |
- package top.lvzhiqiang.service.impl;
- import lombok.extern.slf4j.Slf4j;
- import org.jsoup.Connection;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.springframework.beans.factory.annotation.Value;
- import org.springframework.scheduling.annotation.Async;
- import org.springframework.stereotype.Service;
- import org.springframework.transaction.annotation.Propagation;
- import org.springframework.transaction.annotation.Transactional;
- import org.springframework.util.StopWatch;
- import top.lvzhiqiang.entity.CrawlerLoveFoot;
- import top.lvzhiqiang.entity.DicCode;
- import top.lvzhiqiang.exception.BusinessException;
- import top.lvzhiqiang.mapper.CrawlerLoveFootMapper;
- import top.lvzhiqiang.mapper.DicCodeMapper;
- import top.lvzhiqiang.mapper.VideoSitePoolMapper;
- import top.lvzhiqiang.service.Crawler4LoveFootService;
- import top.lvzhiqiang.util.DateUtils;
- import top.lvzhiqiang.util.JsoupUtil;
- import top.lvzhiqiang.util.StringUtils;
- import javax.annotation.Resource;
- import java.io.*;
- import java.net.InetSocketAddress;
- import java.net.Proxy;
- import java.nio.charset.StandardCharsets;
- import java.time.LocalDate;
- import java.time.LocalDateTime;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.UUID;
- import java.util.stream.Collectors;
- /**
- * Crawler LoveFoot ServiceImpl
- *
- * @author lvzhiqiang
- * 2022/10/17 14:47
- */
- @Service
- @Slf4j
- public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
- @Resource
- private DicCodeMapper dicCodeMapper;
- @Resource
- private CrawlerLoveFootMapper crawlerLoveFootMapper;
- @Resource
- private VideoSitePoolMapper videoSitePoolMapper;
- @Value("${spring.profiles.active}")
- private String env;
- Map<String, String> footConstantMap = null;
- Map<String, String> javbusConstantMap = null;
- Map<String, String> javdbConstantMap = null;
- List<String> javbusUrlList = null;
- Map<String, String> headerMap = new HashMap<>();
- Map<String, String> header2Map = new HashMap<>();
- Map<String, String> header3Map = new HashMap<>();
- Proxy proxy = null;
- public void beforeProxy() {
- if (null == proxy) {
- if ("dev".equals(env)) {
- proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
- } else {
- proxy = Proxy.NO_PROXY;
- }
- }
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
- log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- if (isDel == 1) {
- crawlerLoveFootMapper.deleteAll();
- }
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- footConstantMap = dicCodeList.stream()
- .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount);
- log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds());
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4jpfoot(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
- log.warn("jsoupLoveFoot4jpfoot 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- if (isDel == 1) {
- crawlerLoveFootMapper.deleteAll();
- }
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- footConstantMap = dicCodeList.stream()
- .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- jsoupLoveFoot4jpfootSub(status, ignoreRetryCount);
- log.warn("jsoupLoveFoot4jpfoot 结束:time={}", stopWatch.getTotalTimeSeconds());
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website) {
- log.warn("jjsoupLoveFoot4CrawingFail 开始");
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- // 获取待抓取码列表
- List<CrawlerLoveFoot> loveFootList;
- if (1 == ignoreRetryCount) {
- loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status);
- } else {
- loveFootList = crawlerLoveFootMapper.findInfoByStatus(status);
- }
- if (loveFootList.size() == 0) {
- log.warn("loveFootList为空");
- return;
- }
- log.warn("jsoupLoveFoot4CrawingFail loveFootList size={}", loveFootList.size());
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javdbConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- if ("javbus".equals(website)) {
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- int successCount = jsoupLoveFoot4CrawingFailSub(loveFootList, website);
- log.warn("jjsoupLoveFoot4CrawingFail 结束:totalCount={},successCount={},time={}", loveFootList.size(), successCount, stopWatch.getTotalTimeSeconds());
- }
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
- int successCount = 0;
- for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
- Document searchDocument = null;
- Document codeDocument;
- String message = null;
- int retryCount = 0;
- while (retryCount <= 3) {
- long start = System.currentTimeMillis();
- String searchUrl = null;
- Elements itembSelects = null;
- try {
- String javbusCodeUrl = null;
- if ("javbus".equals(website)) {
- String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
- searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(newName.length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- // throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- if (null == searchDocument) {
- String newName = crawlerLoveFoot.getName().replace("●", "");
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- newName = newName.substring(0, newName.length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(0, newName.length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- // throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- }
- if (null == searchDocument) {
- if (crawlerLoveFoot.getIdentificationCode().length() < 32) {
- javbusCodeUrl = javbusUrl.concat("/").concat(crawlerLoveFoot.getIdentificationCode());
- itembSelects = new Elements();
- } else {
- throw new BusinessException(30000, "javbus search result null");
- }
- } else {
- itembSelects = searchDocument.select("div#waterfall").select("div.item");
- if (itembSelects.size() == 0) {
- throw new BusinessException(30000, "javbus search result null");
- }
- if (crawlerLoveFoot.getIdentificationCode().length() < 32) {
- javbusCodeUrl = javbusUrl.concat("/").concat(crawlerLoveFoot.getIdentificationCode());
- }
- }
- } else if ("javdb".equals(website)) {
- searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
- header3Map.put("referer", searchUrl);
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- itembSelects = searchDocument.select("div.movie-list").select("div.item");
- if (itembSelects.size() == 0) {
- String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
- searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- itembSelects = searchDocument.select("div.movie-list").select("div.item");
- }
- if (itembSelects.size() == 0) {
- throw new BusinessException(30000, "javdb search result null");
- }
- }
- // 获取codeUrl
- String codeUrl = null;
- String title;
- if ("javbus".equals(website)) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- break;
- }
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break;
- }
- String[] newNameArr = crawlerLoveFoot.getName().split("●");
- int matchCount = 0;
- for (String s : newNameArr) {
- if (title.contains(s)) {
- matchCount++;
- }
- }
- if (newNameArr.length == matchCount) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(title);
- break;
- }
- }
- if (StringUtils.isEmpty(codeUrl) && StringUtils.isNotEmpty(javbusCodeUrl)) {
- codeUrl = javbusCodeUrl;
- crawlerLoveFoot.setChangeTitleFlag(1);
- }
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javbus search result mismatch");
- }
- } else if ("javdb".equals(website)) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.box").get(0).attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- break;
- }
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break;
- }
- }
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javdb search result mismatch");
- }
- }
- // 解析codeUrl
- long picTime = 999;
- if ("javbus".equals(website)) {
- codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- picTime = parseJavbusCodeDocument(codeDocument, crawlerLoveFoot);
- } else if ("javdb".equals(website)) {
- codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- picTime = parseJavdbCodeDocument(codeDocument, crawlerLoveFoot);
- crawlerLoveFoot.setJavdbUrl(codeUrl);
- }
- crawlerLoveFoot.setRetryCount(retryCount);
- crawlerLoveFoot.setType(2);
- crawlerLoveFoot.setStatus(3);
- log.warn("jsoupLoveFoot4CrawingFailSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", crawlerLoveFoot.getName(), crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
- break;
- } catch (Exception e) {
- ++retryCount;
- if (retryCount < 4) {
- log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, searchUrl, e);
- } else if (retryCount == 4) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- }
- if (e instanceof BusinessException) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- break;
- }
- }
- }
- if (StringUtils.isNotEmpty(message)) {
- CrawlerLoveFoot crawlerLoveFoot2 = new CrawlerLoveFoot();
- crawlerLoveFoot2.setId(crawlerLoveFoot.getId());
- crawlerLoveFoot2.setFailureCause(message);
- crawlerLoveFoot2.setRetryCount(retryCount);
- crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot2);
- } else {
- crawlerLoveFoot.setFailureCause("");
- crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot);
- successCount++;
- }
- }
- return successCount;
- }
- private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException {
- Elements container = javdbCodeDocument.select("section.section > div.container");
- if (container.size() == 0) {
- throw new BusinessException(30000, "番号无效!");
- }
- Elements videoDetail = container.select("div.video-detail");
- // 名称
- // crawlerLoveFoot.setName(videoDetail.select("h2.title").select("strong.current-title").text().trim());
- Elements moviePanelInfos = videoDetail.select("nav.movie-panel-info");
- Element pEle = moviePanelInfos.get(0);
- // 识别码
- String iCode = pEle.select("div:contains(番號)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
- crawlerLoveFoot.setIdentificationCode(iCode);
- // 发行日期
- String issueDate = pEle.select("div:contains(日期)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
- crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
- // 长度
- String length = pEle.select("div:contains(時長)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
- crawlerLoveFoot.setLength(length);
- // 导演
- Elements directorEles = pEle.select("div:contains(導演)").select("span.value");
- if (directorEles.size() > 0) {
- crawlerLoveFoot.setDirector(directorEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
- }
- // 制作商/片商
- Elements markerEles = pEle.select("div:contains(片商)").select("span.value");
- if (markerEles.size() > 0) {
- crawlerLoveFoot.setMaker(markerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
- }
- // 发行商
- Elements issuerEles = pEle.select("div:contains(發行)").select("span.value");
- if (issuerEles.size() > 0) {
- crawlerLoveFoot.setIssuer(issuerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
- }
- // 类别
- Elements genresEles = pEle.select("div:contains(類別)").select("span.value");
- if (genresEles.size() > 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = genresEles.first().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setGenres(sb.toString());
- }
- // 演员
- Elements castEles = pEle.select("div:contains(演員)").select("span.value");
- if (castEles.size() > 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = castEles.first().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setCast(sb.toString());
- }
- // 图片URL
- Elements videoMetaPanel = videoDetail.select("div.column-video-cover");
- String href = videoMetaPanel.select("a > img").first().attr("src");
- long start = System.currentTimeMillis();
- Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
- String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(crawlerLoveFoot.getName());
- byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
- if (imgUrlBytes.length > 251) {
- byte[] imgUrlDestBytes = new byte[251];
- System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
- fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
- }
- fileName = fileName.concat(".jpg");
- String machiImgUrl = "足舐/".concat(fileName);
- saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
- long end = System.currentTimeMillis();
- crawlerLoveFoot.setImgUrl(machiImgUrl);
- return end - start;
- }
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
- CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4avnoashi();
- LocalDate latestDate;
- if (latestLoveFoot == null) {
- latestDate = LocalDate.of(1970, 1, 1);
- } else {
- latestDate = latestLoveFoot.getUpdateDate();
- }
- String avnoashiUrl = footConstantMap.get("avnoashi_url");
- headerMap.put("referer", avnoashiUrl);
- header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
- Document loveFootDocument;
- Document loveFootDetailDocument;
- outer:
- while (true) {
- loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
- log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
- Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
- for (Element sourceSelect : sourceSelects) {
- String sourceUrl = sourceSelect.select("a").attr("abs:href");
- Integer statusInt = 2;
- Integer typeInt = 1;
- LocalDate clockDate = null;
- LocalDate updateDate = null;
- String keywords = null;
- try {
- loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
- String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
- String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
- clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
- updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
- if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
- break outer;
- }
- // 获取关键词
- keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
- if (StringUtils.isNotEmpty(keywords)) {
- statusInt = 1;
- log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
- } else {
- throw new Exception("keywords is null");
- }
- // 通过关键词获取识别码
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
- crawlerLoveFoot.setType(2);
- crawlerLoveFoot.setStatus(3);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
- if (StringUtils.isNotEmpty(message)) {
- statusInt = 4;
- throw new Exception(message);
- }
- crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
- } catch (Exception e) {
- log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
- crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setName(keywords);
- crawlerLoveFoot.setType(typeInt);
- crawlerLoveFoot.setStatus(statusInt);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- crawlerLoveFoot.setFailureCause(e.getMessage());
- crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
- }
- }
- // 继续下一页
- Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)");
- if (nextSelects.size() > 0) {
- avnoashiUrl = nextSelects.get(0).attr("abs:href");
- } else {
- break;
- }
- }
- }
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4jpfootSub(Integer status, Integer ignoreRetryCount) throws Exception {
- CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4jpfoot();
- LocalDate latestDate;
- if (latestLoveFoot == null) {
- latestDate = LocalDate.of(1970, 1, 1);
- } else {
- latestDate = latestLoveFoot.getUpdateDate();
- }
- String jpfootUrl = footConstantMap.get("jpfoot_url");
- headerMap.put("referer", jpfootUrl);
- Document loveFootDocument;
- Document loveFootDetailDocument;
- outer:
- while (true) {
- loveFootDocument = JsoupUtil.requestDocument(jpfootUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
- log.warn("jsoupLoveFoot4jpfootSub page success:url={}", jpfootUrl);
- Elements sourceSelects = loveFootDocument.select("article.mainContainer > div.av_itemGrid").select("article.av_item");
- for (Element sourceSelect : sourceSelects) {
- Thread.sleep(1000L);
- String sourceUrl = sourceSelect.select("a.av_itemLink").attr("abs:href");
- Integer statusInt = 2;
- Integer typeInt = 1;
- LocalDate clockDate = null;
- LocalDate updateDate = null;
- String keywords = null;
- try {
- loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
- String dateStr = loveFootDetailDocument.select("div.avdetail_date").select("span.avdetail_dateText").text();
- clockDate = LocalDate.parse(dateStr, DateUtils.dateFormatter4);
- updateDate = clockDate;
- if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
- break outer;
- }
- // 获取关键词
- keywords = loveFootDetailDocument.select("div.avdetail_detailTop").select("p.avdetail_detailTopTitle").text().trim();
- if (StringUtils.isNotEmpty(keywords)) {
- statusInt = 1;
- log.warn("jsoupLoveFoot4jpfootSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
- } else {
- throw new Exception("keywords is null");
- }
- // 通过关键词获取识别码
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
- crawlerLoveFoot.setType(2);
- crawlerLoveFoot.setStatus(3);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- String message = parseKeywordsToCode(crawlerLoveFoot, keywords);
- if (StringUtils.isNotEmpty(message)) {
- statusInt = 4;
- throw new Exception(message);
- }
- crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
- } catch (Exception e) {
- log.error("jsoupLoveFoot4jpfootSub detail fail,sourceUrl={}", sourceUrl, e);
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
- crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setName(keywords);
- crawlerLoveFoot.setType(typeInt);
- crawlerLoveFoot.setStatus(statusInt);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- crawlerLoveFoot.setFailureCause(e.getMessage());
- crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
- }
- }
- // 继续下一页
- Elements nextSelects = loveFootDocument.select("nav.pagination > div.nav-links").select("a.next");
- if (nextSelects.size() > 0) {
- jpfootUrl = nextSelects.get(0).attr("abs:href");
- } else {
- break;
- }
- }
- }
- private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords) {
- int retryCount = 0;
- Document javbusSearchDocument = null;
- Document javbusCodeDocument;
- String message = null;
- while (retryCount <= 3) {
- long start = System.currentTimeMillis();
- try {
- String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
- String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- String newName = keywords.substring(keywords.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(newName.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- // throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- if (null == javbusSearchDocument) {
- String newName = keywords.replace("●", "");
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- newName = newName.substring(0, newName.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(0, newName.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- }
- Elements itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
- if (itembSelects.size() == 0) {
- throw new BusinessException(30000, "javbus search result null");
- }
- // 获取codeUrl
- String codeUrl = null;
- String title;
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
- if (title.contains(keywords)) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- break;
- }
- String newName = keywords.replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break;
- }
- String[] newNameArr = keywords.split("●");
- int matchCount = 0;
- for (String s : newNameArr) {
- if (title.contains(s)) {
- matchCount++;
- }
- }
- if (newNameArr.length == matchCount) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(title);
- break;
- }
- }
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javbus search result mismatch");
- }
- // 解析codeUrl
- javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- long picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
- crawlerLoveFoot.setRetryCount(retryCount);
- log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
- break;
- } catch (Exception e) {
- ++retryCount;
- if (retryCount < 4) {
- log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e);
- } else if (retryCount == 4) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- }
- if (e instanceof BusinessException) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- break;
- }
- }
- }
- return message;
- }
- private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception {
- Elements container = document.select("div.container");
- if (container.size() == 0) {
- throw new BusinessException(30000, "番号无效!");
- }
- // 名称
- String h3 = container.select("h3").first().text();
- String[] nameArr = h3.split("\\s+");
- if (nameArr.length > 1) {
- crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim());
- } else {
- crawlerLoveFoot.setName(nameArr[0]);
- }
- Elements pEles = container.select("div.info > p");
- // 识别码
- Element pEle = pEles.get(0);
- String iCode = pEle.select("span[style]").first().text();
- crawlerLoveFoot.setIdentificationCode(iCode);
- // 发行日期
- pEle = pEles.get(1);
- String issueDate = pEle.text().split(":")[1].replace("\"", "").trim();
- crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
- // 长度
- pEle = pEles.get(2);
- String length = pEle.text().split(":")[1].replace("\"", "").trim();
- crawlerLoveFoot.setLength(length);
- // 导演
- Elements directorEles = container.select("div.info").select("p:contains(導演)");
- if (directorEles.size() > 0) {
- pEle = directorEles.first().select("a[href]").first();
- crawlerLoveFoot.setDirector(pEle.text());
- }
- // 制作商
- Elements markerEles = container.select("div.info").select("p:contains(製作商)");
- if (markerEles.size() > 0) {
- pEle = markerEles.first().select("a[href]").first();
- crawlerLoveFoot.setMaker(pEle.text());
- }
- // 发行商
- Elements issuerEles = container.select("div.info").select("p:contains(發行商)");
- if (issuerEles.size() > 0) {
- pEle = issuerEles.first().select("a[href]").first();
- crawlerLoveFoot.setIssuer(pEle.text());
- }
- // 类别
- Elements genresEles = container.select("div.info").select("p:contains(類別)");
- if (genresEles.size() > 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text()).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setGenres(sb.toString());
- }
- // 演员
- Elements castEles = container.select("div.info").select("p.star-show:contains(演員)");
- if (castEles.size() > 0) {
- Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)");
- if (castElesTemp.size() == 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text()).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setCast(sb.toString());
- }
- }
- // 图片URL
- String href = container.select("a.bigImage").first().attr("abs:href");
- long start = System.currentTimeMillis();
- Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
- String fileName = issueDate.concat(" ").concat(h3).replace("/", "_");
- byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
- if (imgUrlBytes.length > 251) {
- byte[] imgUrlDestBytes = new byte[251];
- System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
- fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
- }
- fileName = fileName.concat(".jpg");
- String machiImgUrl = "足舐/".concat(fileName);
- saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
- long end = System.currentTimeMillis();
- crawlerLoveFoot.setImgUrl(machiImgUrl);
- return end - start;
- }
- /**
- * 保存文件到本地
- *
- * @param bufferedInputStream
- * @param savePath
- */
- private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException {
- //一次最多读取1k
- byte[] buffer = new byte[1024];
- //实际读取的长度
- int readLenghth;
- //创建的一个写出的缓冲流
- BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath)));
- //文件逐步写入本地
- while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
- bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
- }
- //关闭缓冲流
- bufferedOutputStream.close();
- bufferedInputStream.close();
- }
- public static void main(String[] args) {
- String s = "リア充反対!彼女の目の前で彼氏を拘束、●す鬼畜痴女";
- String newName = s.substring(s.length() / 2);
- newName = newName.substring(newName.length() / 2);
- System.out.println(newName);
- }
- }
|