| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195 |
- package top.lvzhiqiang.service.impl;
- import lombok.extern.slf4j.Slf4j;
- import org.jsoup.Connection;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.springframework.beans.factory.annotation.Value;
- import org.springframework.scheduling.annotation.Async;
- import org.springframework.stereotype.Service;
- import org.springframework.transaction.annotation.Propagation;
- import org.springframework.transaction.annotation.Transactional;
- import org.springframework.util.StopWatch;
- import top.lvzhiqiang.entity.CrawlerLoveFoot;
- import top.lvzhiqiang.entity.DicCode;
- import top.lvzhiqiang.exception.BusinessException;
- import top.lvzhiqiang.mapper.CrawlerLoveFootMapper;
- import top.lvzhiqiang.mapper.DicCodeMapper;
- import top.lvzhiqiang.mapper.VideoSitePoolMapper;
- import top.lvzhiqiang.service.Crawler4LoveFootService;
- import top.lvzhiqiang.util.DateUtils;
- import top.lvzhiqiang.util.JsoupUtil;
- import top.lvzhiqiang.util.StringUtils;
- import javax.annotation.Resource;
- import java.io.*;
- import java.net.InetSocketAddress;
- import java.net.Proxy;
- import java.net.URLDecoder;
- import java.nio.charset.StandardCharsets;
- import java.time.LocalDate;
- import java.time.LocalDateTime;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.UUID;
- import java.util.stream.Collectors;
- /**
- * Crawler LoveFoot ServiceImpl
- *
- * @author lvzhiqiang
- * 2022/10/17 14:47
- */
- @Service
- @Slf4j
- public class Crawler4LoveFootServiceImpl implements Crawler4LoveFootService {
- @Resource
- private DicCodeMapper dicCodeMapper; // Dictionary access; findAll() feeds the constant maps declared below.
- @Resource
- private CrawlerLoveFootMapper crawlerLoveFootMapper; // Reads and writes the crawled CrawlerLoveFoot rows.
- @Resource
- private VideoSitePoolMapper videoSitePoolMapper; // Source of the javbus mirror URLs stored in javbusUrlList.
- @Value("${spring.profiles.active}")
- private String env; // Active Spring profile; filters dictionary rows and decides the proxy in beforeProxy().
- Map<String, String> footConstantMap = null; // codeKey -> codeValue for dictionary rows whose codeDesc is "foot"; rebuilt by the crawl entry points.
- Map<String, String> javbusConstantMap = null; // codeKey -> codeValue for dictionary rows of type 1; rebuilt by the crawl entry points.
- Map<String, String> javdbConstantMap = null; // codeKey -> codeValue for dictionary rows of type 2; rebuilt by the crawl entry points.
- List<String> javbusUrlList = null; // Result of videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1); one entry is picked at random per javbus search.
- Map<String, String> headerMap = new HashMap<>(); // Header map passed to JsoupUtil.requestDocument for list pages; the crawl methods set its "referer".
- Map<String, String> header2Map = new HashMap<>(); // Header map passed to JsoupUtil.requestDocument for source detail pages.
- Map<String, String> header3Map = new HashMap<>(); // Header map passed to JsoupUtil.requestDocument for javdb requests; "referer" is set to the search URL.
- Proxy proxy = null; // Set once by beforeProxy(). NOTE(review): all of the fields above are reassigned by @Async methods - confirm runs never overlap.
- public void beforeProxy() {
- if (null == proxy) {
- if ("dev".equals(env)) {
- proxy = new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080));
- } else {
- proxy = Proxy.NO_PROXY;
- }
- }
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4avnoashi(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
- log.warn("jsoupFoot4avnoashi 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- if (isDel == 1) {
- crawlerLoveFootMapper.deleteAll();
- }
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- footConstantMap = dicCodeList.stream()
- .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- jsoupLoveFoot4avnoashiSub(status, ignoreRetryCount);
- log.warn("jsoupFoot4avnoashi 结束:time={}", stopWatch.getTotalTimeSeconds());
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4jpfoot(Integer status, Integer isDel, Integer ignoreRetryCount) throws Exception {
- log.warn("jsoupLoveFoot4jpfoot 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- if (isDel == 1) {
- crawlerLoveFootMapper.deleteAll();
- }
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- footConstantMap = dicCodeList.stream()
- .filter(x -> "foot".equals(x.getCodeDesc()) && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javdbConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- jsoupLoveFoot4jpfootSub(status, ignoreRetryCount);
- log.warn("jsoupLoveFoot4jpfoot 结束:time={}", stopWatch.getTotalTimeSeconds());
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4CrawingFail(Integer status, Integer ignoreRetryCount, String website, String identificationCode) {
- log.warn("jjsoupLoveFoot4CrawingFail 开始");
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- // 获取待抓取码列表
- List<CrawlerLoveFoot> loveFootList;
- if (StringUtils.isNotEmpty(identificationCode)) {
- loveFootList = crawlerLoveFootMapper.findByCodeAndType(identificationCode, null, null);
- } else {
- if (1 == ignoreRetryCount) {
- loveFootList = crawlerLoveFootMapper.findInfoByStatus4IgnoreRetryCount(status);
- } else {
- loveFootList = crawlerLoveFootMapper.findInfoByStatus(status);
- }
- }
- if (loveFootList.size() == 0) {
- log.warn("loveFootList为空");
- return;
- }
- log.warn("jsoupLoveFoot4CrawingFail loveFootList size={}", loveFootList.size());
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javdbConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- if ("javbus".equals(website)) {
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- int successCount = jsoupLoveFoot4CrawingFailSub(loveFootList, website);
- log.warn("jjsoupLoveFoot4CrawingFail 结束:totalCount={},successCount={},time={}", loveFootList.size(), successCount, stopWatch.getTotalTimeSeconds());
- }
- @Async
- @Override
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4CrawingFail2(Integer status, Integer ignoreRetryCount, String website, String identificationCode) {
- log.warn("jsoupLoveFoot4CrawingFail2 开始");
- StopWatch stopWatch = new StopWatch();
- stopWatch.start();
- // 获取待抓取码列表
- List<CrawlerLoveFoot> loveFootList = crawlerLoveFootMapper.findByCodeAndType(identificationCode, null, null);
- if (loveFootList.size() == 0) {
- log.warn("loveFootList为空");
- return;
- }
- log.warn("jsoupLoveFoot4CrawingFail2 loveFootList size={}", loveFootList.size());
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- // 获取常量MAP
- javbusConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 1 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- javdbConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- // 获取javbus防屏蔽地址
- if ("javbus".equals(website)) {
- javbusUrlList = videoSitePoolMapper.findUrlByTypeAndDeleteFlag(1, 1);
- if (javbusUrlList.size() == 0) {
- log.warn("javbusUrlList为空");
- return;
- }
- }
- // 代理及TOKEN设置
- beforeProxy();
- // 解析原始站点
- // 通过关键词获取识别码
- CrawlerLoveFoot crawlerLoveFoot = loveFootList.get(0);
- try {
- String message = parseKeywordsToCode(crawlerLoveFoot, crawlerLoveFoot.getName(), "javdb");
- if (StringUtils.isNotEmpty(message)) {
- throw new Exception(message);
- }
- crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
- } catch (Exception e) {
- log.error("jsoupLoveFoot4CrawingFail2 detail fail,sourceUrl={}", crawlerLoveFoot.getOrginJpfootUrl(), e);
- crawlerLoveFoot.setFailureCause(e.getMessage());
- crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
- }
- log.warn("jsoupLoveFoot4CrawingFail2 结束:totalCount={},time={}", loveFootList.size(), stopWatch.getTotalTimeSeconds());
- }
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public int jsoupLoveFoot4CrawingFailSub(List<CrawlerLoveFoot> loveFootList, String website) {
- int successCount = 0;
- Document loveFootDetailDocument;
- for (CrawlerLoveFoot crawlerLoveFoot : loveFootList) {
- Document searchDocument = null;
- Document codeDocument;
- String message = null;
- int retryCount = 0;
- if (StringUtils.isEmpty(crawlerLoveFoot.getName()) && StringUtils.isNotEmpty(crawlerLoveFoot.getOrginAvnoashiUrl()) && crawlerLoveFoot.getOrginAvnoashiUrl().contains("avnoashi-1.com")) {
- try {
- headerMap.put("referer", crawlerLoveFoot.getOrginAvnoashiUrl());
- loveFootDetailDocument = JsoupUtil.requestDocument(crawlerLoveFoot.getOrginAvnoashiUrl(), JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
- // 获取关键词
- String keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
- if (StringUtils.isNotEmpty(keywords)) {
- crawlerLoveFoot.setName(keywords);
- } else {
- crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab");
- }
- } catch (Exception e) {
- crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab");
- }
- } else if (StringUtils.isEmpty(crawlerLoveFoot.getName()) && StringUtils.isNotEmpty(crawlerLoveFoot.getOrginJpfootUrl()) && crawlerLoveFoot.getOrginJpfootUrl().contains("jp-foot.net")) {
- try {
- headerMap.put("referer", crawlerLoveFoot.getOrginJpfootUrl());
- loveFootDetailDocument = JsoupUtil.requestDocument(crawlerLoveFoot.getOrginJpfootUrl(), JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
- // 获取关键词
- String keywords = loveFootDetailDocument.select("div.avdetail_detailTop").select("p.avdetail_detailTopTitle").text().trim();
- if (StringUtils.isNotEmpty(keywords)) {
- crawlerLoveFoot.setName(keywords);
- } else {
- crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
- }
- } catch (Exception e) {
- crawlerLoveFoot.setName("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
- }
- }
- while (retryCount <= 3) {
- long start = System.currentTimeMillis();
- String searchUrl = null;
- Elements itembSelects = null;
- try {
- Thread.sleep(3000);
- String javbusCodeUrl = null;
- // 获取codeUrl
- String codeUrl = null;
- String title;
- if ("javbus".equals(website)) {
- String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
- searchUrl = javbusUrl.concat("/search/").concat(crawlerLoveFoot.getName()).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(newName.length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- // throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- if (null == searchDocument) {
- String newName = crawlerLoveFoot.getName().replace("●", "");
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- newName = newName.substring(0, newName.length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(0, newName.length() / 2);
- searchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- // throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- }
- if (null == searchDocument) {
- if (crawlerLoveFoot.getIdentificationCode().length() < 32) {
- javbusCodeUrl = javbusUrl.concat("/").concat(crawlerLoveFoot.getIdentificationCode());
- itembSelects = new Elements();
- } else {
- throw new BusinessException(30000, "javbus search result null");
- }
- } else {
- itembSelects = searchDocument.select("div#waterfall").select("div.item");
- if (itembSelects.size() == 0) {
- throw new BusinessException(30000, "javbus search result null");
- }
- if (crawlerLoveFoot.getIdentificationCode().length() < 32) {
- javbusCodeUrl = javbusUrl.concat("/").concat(crawlerLoveFoot.getIdentificationCode());
- }
- }
- } else if ("javdb".equals(website)) {
- crawlerLoveFoot.setName(crawlerLoveFoot.getName().replace("%", "%"));
- searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
- header3Map.put("referer", searchUrl);
- subsearch:
- {
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- itembSelects = searchDocument.select("div.movie-list").select("div.item");
- if (itembSelects.size() != 0) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.box").get(0).attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- break subsearch;
- }
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("●", "這");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("○", "〇");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- }
- }
- String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
- searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
- searchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- itembSelects = searchDocument.select("div.movie-list").select("div.item");
- if (itembSelects.size() != 0) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.box").get(0).attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("●", "這");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("○", "〇");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- }
- }
- }
- }
- if ("javbus".equals(website)) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- break;
- }
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break;
- }
- String[] newNameArr = crawlerLoveFoot.getName().split("●");
- int matchCount = 0;
- for (String s : newNameArr) {
- if (title.contains(s)) {
- matchCount++;
- }
- }
- if (newNameArr.length == matchCount) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(title);
- break;
- }
- }
- if (StringUtils.isEmpty(codeUrl) && StringUtils.isNotEmpty(javbusCodeUrl)) {
- codeUrl = javbusCodeUrl;
- crawlerLoveFoot.setChangeTitleFlag(1);
- }
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javbus search result mismatch");
- }
- } else if ("javdb".equals(website)) {
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javdb search result mismatch");
- }
- }
- // 解析codeUrl
- long picTime = 999;
- if ("javbus".equals(website)) {
- codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- picTime = parseJavbusCodeDocument(codeDocument, crawlerLoveFoot);
- } else if ("javdb".equals(website)) {
- codeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- picTime = parseJavdbCodeDocument(codeDocument, crawlerLoveFoot);
- crawlerLoveFoot.setJavdbUrl(codeUrl);
- }
- crawlerLoveFoot.setRetryCount(retryCount);
- crawlerLoveFoot.setType(2);
- crawlerLoveFoot.setStatus(3);
- log.warn("jsoupLoveFoot4CrawingFailSub parseKeywordsToCode success,num={},keywords={},code={},picTime={},time={}", successCount, crawlerLoveFoot.getName(), crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
- break;
- } catch (Exception e) {
- ++retryCount;
- if (retryCount < 4) {
- log.error("jsoupLoveFoot4CrawingFailSub error重试:,retryCount={},time={},javdbSearchUrl={}", retryCount, System.currentTimeMillis() - start, searchUrl, e);
- } else if (retryCount == 4) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- }
- if (e instanceof BusinessException) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- break;
- }
- }
- }
- if (StringUtils.isNotEmpty(message)) {
- CrawlerLoveFoot crawlerLoveFoot2 = new CrawlerLoveFoot();
- crawlerLoveFoot2.setId(crawlerLoveFoot.getId());
- crawlerLoveFoot2.setFailureCause(message);
- crawlerLoveFoot2.setRetryCount(retryCount);
- crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot2);
- } else {
- crawlerLoveFoot.setFailureCause("");
- Integer exist = crawlerLoveFootMapper.existLoveFootByCode(crawlerLoveFoot.getIdentificationCode());
- if (exist != null) {
- crawlerLoveFootMapper.updateInfoByCode(crawlerLoveFoot);
- } else {
- crawlerLoveFootMapper.updateInfoById(crawlerLoveFoot);
- }
- successCount++;
- }
- }
- return successCount;
- }
- private long parseJavdbCodeDocument(Document javdbCodeDocument, CrawlerLoveFoot crawlerLoveFoot) throws IOException {
- Elements container = javdbCodeDocument.select("section.section > div.container");
- if (container.size() == 0) {
- throw new BusinessException(30000, "番号无效!");
- }
- Elements videoDetail = container.select("div.video-detail");
- // 名称
- // crawlerLoveFoot.setName(videoDetail.select("h2.title").select("strong.current-title").text().trim());
- Elements moviePanelInfos = videoDetail.select("nav.movie-panel-info");
- Element pEle = moviePanelInfos.get(0);
- // 识别码
- String iCode = pEle.select("div:contains(番號)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
- crawlerLoveFoot.setIdentificationCode(iCode);
- // 发行日期
- String issueDate = pEle.select("div:contains(日期)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
- crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
- // 长度
- String length = pEle.select("div:contains(時長)").select("span.value").first().text().replace(" ", "").replaceAll("\\s+", "");
- crawlerLoveFoot.setLength(length);
- // 导演
- Elements directorEles = pEle.select("div:contains(導演)").select("span.value");
- if (directorEles.size() > 0) {
- crawlerLoveFoot.setDirector(directorEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
- }
- // 制作商/片商
- Elements markerEles = pEle.select("div:contains(片商)").select("span.value");
- if (markerEles.size() > 0) {
- crawlerLoveFoot.setMaker(markerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
- }
- // 发行商
- Elements issuerEles = pEle.select("div:contains(發行)").select("span.value");
- if (issuerEles.size() > 0) {
- crawlerLoveFoot.setIssuer(issuerEles.first().select("a[href]").first().text().replace(" ", "").replaceAll("\\s+", ""));
- }
- // 类别
- Elements genresEles = pEle.select("div:contains(類別)").select("span.value");
- if (genresEles.size() > 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = genresEles.first().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setGenres(sb.toString());
- }
- // 演员
- Elements castEles = pEle.select("div:contains(演員)").select("span.value");
- if (castEles.size() > 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = castEles.first().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text().replace(" ", "").replaceAll("\\s+", "")).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setCast(sb.toString());
- }
- // 图片URL
- Elements videoMetaPanel = videoDetail.select("div.column-video-cover");
- String href = videoMetaPanel.select("a > img").first().attr("src");
- long start = System.currentTimeMillis();
- Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
- String fileName = issueDate.concat(" ").concat(iCode).concat(" ").concat(StringUtils.escapeJavParam(crawlerLoveFoot.getName()));
- byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
- if (imgUrlBytes.length > 251) {
- byte[] imgUrlDestBytes = new byte[251];
- System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
- fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
- }
- fileName = fileName.concat(".jpg");
- String machiImgUrl = "足舐/".concat(fileName);
- saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
- long end = System.currentTimeMillis();
- crawlerLoveFoot.setImgUrl(machiImgUrl);
- return end - start;
- }
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4avnoashiSub(Integer status, Integer ignoreRetryCount) throws Exception {
- CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4avnoashi();
- LocalDate latestDate;
- if (latestLoveFoot == null) {
- latestDate = LocalDate.of(1970, 1, 1);
- } else {
- latestDate = latestLoveFoot.getUpdateDate();
- }
- List<DicCode> dicCodeList = dicCodeMapper.findAll();
- javdbConstantMap = dicCodeList.stream()
- .filter(x -> x.getType() != null && 2 == x.getType() && x.getEnv().contains(env))
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
- String avnoashiUrl = footConstantMap.get("avnoashi_url");
- headerMap.put("referer", avnoashiUrl);
- header2Map.put("referer", avnoashiUrl.concat("?sort=newer"));
- Document loveFootDocument;
- Document loveFootDetailDocument;
- boolean tiaoguoFlag = true;
- int pageNum = 0;
- while (true) {
- loveFootDocument = JsoupUtil.requestDocument(avnoashiUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
- pageNum++;
- if (pageNum > 50) {
- break;
- }
- log.warn("jsoupLoveFoot4avnoashiSub page success:url={}", avnoashiUrl);
- Elements sourceSelects = loveFootDocument.select("div.dividerBottom > div.archive").select("div.archive__contents").select("h2");
- for (Element sourceSelect : sourceSelects) {
- String sourceUrl = sourceSelect.select("a").attr("abs:href");
- if (tiaoguoFlag) {
- tiaoguoFlag = false;
- continue;
- }
- Integer statusInt = 2;
- Integer typeInt = 1;
- LocalDate clockDate = null;
- LocalDate updateDate = null;
- String keywords = null;
- try {
- loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, header2Map, null);
- /*String clockDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-clock").text();
- String updateDateStr = loveFootDetailDocument.select("div.viral").select("li.icon-update").text();
- clockDate = LocalDate.parse(clockDateStr, DateUtils.dateFormatter3);
- updateDate = LocalDate.parse(updateDateStr, DateUtils.dateFormatter3);
- if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
- break outer;
- }*/
- // 获取关键词
- keywords = loveFootDetailDocument.select("div.postContents").select("td:contains(タイトル)").next("td").text();
- if (StringUtils.isNotEmpty(keywords)) {
- statusInt = 1;
- log.warn("jsoupLoveFoot4avnoashiSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
- } else {
- throw new Exception("keywords is null");
- }
- // 通过关键词获取识别码
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
- crawlerLoveFoot.setType(2);
- crawlerLoveFoot.setStatus(3);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- crawlerLoveFoot.setName(keywords);
- Thread.sleep(3000);
- String message = parseKeywordsToCode(crawlerLoveFoot, keywords, "javdb");
- if (StringUtils.isNotEmpty(message)) {
- statusInt = 4;
- throw new Exception(message);
- }
- CrawlerLoveFoot exist = crawlerLoveFootMapper.findLoveFootByOrginAvnoashiUrl(sourceUrl);
- if (exist == null || exist.getStatus() != 3) {
- crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
- }
- } catch (Exception e) {
- Integer exist = crawlerLoveFootMapper.existLoveFootByOrginAvnoashiUrl(sourceUrl);
- if (exist == null) {
- log.error("jsoupLoveFoot4avnoashiSub detail fail,sourceUrl={}", sourceUrl, e);
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
- crawlerLoveFoot.setOrginAvnoashiUrl(sourceUrl);
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setName(keywords);
- crawlerLoveFoot.setType(typeInt);
- crawlerLoveFoot.setStatus(statusInt);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- crawlerLoveFoot.setFailureCause(e.getMessage());
- crawlerLoveFootMapper.insertOrUpdate4avnoashi(crawlerLoveFoot);
- }
- }
- }
- // 继续下一页
- Elements nextSelects = loveFootDocument.select("ul.pager").select("a:contains(Next)");
- if (nextSelects.size() > 0) {
- avnoashiUrl = nextSelects.get(0).attr("abs:href");
- } else {
- break;
- }
- }
- }
- @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
- public void jsoupLoveFoot4jpfootSub(Integer status, Integer ignoreRetryCount) throws Exception {
- CrawlerLoveFoot latestLoveFoot = crawlerLoveFootMapper.findLatestInfo4jpfoot();
- LocalDate latestDate;
- if (latestLoveFoot == null) {
- latestDate = LocalDate.of(1970, 1, 1);
- } else {
- latestDate = latestLoveFoot.getUpdateDate();
- }
- String jpfootUrl = footConstantMap.get("jpfoot_url");
- headerMap.put("referer", jpfootUrl);
- Document loveFootDocument;
- Document loveFootDetailDocument;
- outer:
- while (true) {
- loveFootDocument = JsoupUtil.requestDocument(jpfootUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
- log.warn("jsoupLoveFoot4jpfootSub page success:url={}", jpfootUrl);
- Elements sourceSelects = loveFootDocument.select("article.mainContainer > div.av_itemGrid").select("article.av_item");
- for (Element sourceSelect : sourceSelects) {
- Thread.sleep(1000L);
- String sourceUrl = sourceSelect.select("a.av_itemLink").attr("abs:href");
- sourceUrl = URLDecoder.decode(sourceUrl, "UTF-8");
- Integer statusInt = 2;
- Integer typeInt = 1;
- LocalDate clockDate = null;
- LocalDate updateDate = null;
- String keywords = null;
- try {
- loveFootDetailDocument = JsoupUtil.requestDocument(sourceUrl, JsoupUtil.HTTP_GET, proxy, null, headerMap, null);
- String dateStr = loveFootDetailDocument.select("div.avdetail_date").select("span.avdetail_dateText").text();
- clockDate = LocalDate.parse(dateStr, DateUtils.dateFormatter4);
- updateDate = clockDate;
- if (updateDate.isBefore(latestDate) || updateDate.isEqual(latestDate)) {
- break outer;
- }
- // 获取关键词
- keywords = loveFootDetailDocument.select("div.avdetail_detailTop").select("p.avdetail_detailTopTitle").text().trim();
- if (StringUtils.isNotEmpty(keywords)) {
- statusInt = 1;
- log.warn("jsoupLoveFoot4jpfootSub parseDetailToKeywords success,sourceUrl={},keywords={}", sourceUrl, keywords);
- } else {
- throw new Exception("keywords is null");
- }
- // 通过关键词获取识别码
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
- crawlerLoveFoot.setType(2);
- crawlerLoveFoot.setStatus(3);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- crawlerLoveFoot.setName(keywords);
- Thread.sleep(3000);
- String message = parseKeywordsToCode(crawlerLoveFoot, keywords, "javdb");
- if (StringUtils.isNotEmpty(message)) {
- statusInt = 4;
- throw new Exception(message);
- }
- crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
- } catch (Exception e) {
- log.error("jsoupLoveFoot4jpfootSub detail fail,sourceUrl={}", sourceUrl, e);
- CrawlerLoveFoot crawlerLoveFoot = new CrawlerLoveFoot();
- crawlerLoveFoot.setIdentificationCode(UUID.randomUUID().toString());
- crawlerLoveFoot.setOrginJpfootUrl(sourceUrl);
- crawlerLoveFoot.setClockDate(clockDate);
- crawlerLoveFoot.setUpdateDate(updateDate);
- crawlerLoveFoot.setName(keywords);
- crawlerLoveFoot.setType(typeInt);
- crawlerLoveFoot.setStatus(statusInt);
- crawlerLoveFoot.setCreateTime(LocalDateTime.now());
- crawlerLoveFoot.setFailureCause(e.getMessage());
- crawlerLoveFootMapper.insertOrUpdate4jpfoot(crawlerLoveFoot);
- }
- }
- // 继续下一页
- Elements nextSelects = loveFootDocument.select("nav.pagination > div.nav-links").select("a.next");
- if (nextSelects.size() > 0) {
- jpfootUrl = nextSelects.get(0).attr("abs:href");
- } else {
- break;
- }
- }
- }
- private String parseKeywordsToCode(CrawlerLoveFoot crawlerLoveFoot, String keywords, String website) {
- int retryCount = 0;
- Document javbusSearchDocument = null;
- Document javbusCodeDocument;
- Document javdbSearchDocument;
- Document javdbCodeDocument;
- String message = null;
- while (retryCount <= 3) {
- long start = System.currentTimeMillis();
- Elements itembSelects = null;
- try {
- Thread.sleep(3000);
- // 获取codeUrl
- String codeUrl = null;
- String title;
- if ("javbus".equals(website)) {
- String javbusUrl = javbusUrlList.get((int) (0 + Math.random() * (javbusUrlList.size())));
- String javbusSearchUrl = javbusUrl.concat("/search/").concat(keywords).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- String newName = keywords.substring(keywords.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(newName.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- // throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- if (null == javbusSearchDocument) {
- String newName = keywords.replace("●", "");
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception ee) {
- newName = newName.substring(0, newName.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eee) {
- newName = newName.substring(0, newName.length() / 2);
- javbusSearchUrl = javbusUrl.concat("/search/").concat(newName).concat("&parent=ce");
- try {
- javbusSearchDocument = JsoupUtil.requestDocument(javbusSearchUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- } catch (Exception eeee) {
- throw new BusinessException(30000, "javbus search result null");
- }
- }
- }
- }
- itembSelects = javbusSearchDocument.select("div#waterfall").select("div.item");
- if (itembSelects.size() == 0) {
- throw new BusinessException(30000, "javbus search result null");
- }
- } else if ("javdb".equals(website)) {
- crawlerLoveFoot.setName(crawlerLoveFoot.getName().replace("%", "%").replace("#", "#").replace("?", "?"));
- String searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(crawlerLoveFoot.getName()).concat("&f=all");
- header3Map.put("referer", searchUrl);
- subsearch:
- {
- javdbSearchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
- if (itembSelects.size() != 0) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.box").get(0).attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- break subsearch;
- }
- String newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("●", "這");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("○", "〇");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- }
- }
- String newName = crawlerLoveFoot.getName().substring(crawlerLoveFoot.getName().length() / 2);
- searchUrl = javdbConstantMap.get("javdb").concat("search?q=").concat(newName).concat("&f=all");
- javdbSearchDocument = JsoupUtil.requestDocument(searchUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- itembSelects = javdbSearchDocument.select("div.movie-list").select("div.item");
- if (itembSelects.size() != 0) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.box").get(0).attr("title");
- if (title.contains(crawlerLoveFoot.getName())) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("●", "這");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- newName = crawlerLoveFoot.getName().replace("○", "〇");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break subsearch;
- }
- }
- }
- }
- }
- if ("javbus".equals(website)) {
- for (Element itembSelect : itembSelects) {
- title = itembSelect.select("a.movie-box").get(0).select("div.photo-frame > img").attr("title");
- if (title.contains(keywords)) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- break;
- }
- String newName = keywords.replace("●", "さ");
- if (title.contains(newName)) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(newName);
- break;
- }
- String[] newNameArr = keywords.split("●");
- int matchCount = 0;
- for (String s : newNameArr) {
- if (title.contains(s)) {
- matchCount++;
- }
- }
- if (newNameArr.length == matchCount) {
- codeUrl = itembSelect.select("a.movie-box").get(0).attr("abs:href");
- crawlerLoveFoot.setName(title);
- break;
- }
- }
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javbus search result mismatch");
- }
- } else if ("javdb".equals(website)) {
- if (StringUtils.isEmpty(codeUrl)) {
- throw new BusinessException(30000, "javdb search result mismatch");
- }
- }
- // 解析codeUrl
- long picTime = 999;
- if ("javbus".equals(website)) {
- javbusCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, null, null);
- picTime = parseJavbusCodeDocument(javbusCodeDocument, crawlerLoveFoot);
- } else if ("javdb".equals(website)) {
- javdbCodeDocument = JsoupUtil.requestDocument(codeUrl, JsoupUtil.HTTP_GET, proxy, null, header3Map, null);
- picTime = parseJavdbCodeDocument(javdbCodeDocument, crawlerLoveFoot);
- crawlerLoveFoot.setJavdbUrl(codeUrl);
- }
- crawlerLoveFoot.setRetryCount(retryCount);
- log.warn("jsoupLoveFoot4avnoashiSub parseKeywordsToCode success,keywords={},code={},picTime={},time={}", keywords, crawlerLoveFoot.getIdentificationCode(), picTime, System.currentTimeMillis() - start);
- break;
- } catch (Exception e) {
- ++retryCount;
- if (retryCount < 4) {
- log.error("javbusSearch error重试:,retryCount={},time={},keywords={}", retryCount, System.currentTimeMillis() - start, keywords, e);
- } else if (retryCount == 4) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- }
- if (e instanceof BusinessException) {
- message = e.getMessage().length() <= 200 ? e.getMessage() : e.getMessage().substring(0, 200);
- break;
- }
- }
- }
- return message;
- }
- private long parseJavbusCodeDocument(Document document, CrawlerLoveFoot crawlerLoveFoot) throws Exception {
- Elements container = document.select("div.container");
- if (container.size() == 0) {
- throw new BusinessException(30000, "番号无效!");
- }
- // 名称
- String h3 = container.select("h3").first().text();
- String[] nameArr = h3.split("\\s+");
- if (nameArr.length > 1) {
- crawlerLoveFoot.setName(h3.substring(nameArr[0].length()).trim());
- } else {
- crawlerLoveFoot.setName(nameArr[0]);
- }
- Elements pEles = container.select("div.info > p");
- // 识别码
- Element pEle = pEles.get(0);
- String iCode = pEle.select("span[style]").first().text();
- crawlerLoveFoot.setIdentificationCode(iCode);
- // 发行日期
- pEle = pEles.get(1);
- String issueDate = pEle.text().split(":")[1].replace("\"", "").trim();
- crawlerLoveFoot.setIssueDate(LocalDate.parse(issueDate, DateUtils.dateFormatter));
- // 长度
- pEle = pEles.get(2);
- String length = pEle.text().split(":")[1].replace("\"", "").trim();
- crawlerLoveFoot.setLength(length);
- // 导演
- Elements directorEles = container.select("div.info").select("p:contains(導演)");
- if (directorEles.size() > 0) {
- pEle = directorEles.first().select("a[href]").first();
- crawlerLoveFoot.setDirector(pEle.text());
- }
- // 制作商
- Elements markerEles = container.select("div.info").select("p:contains(製作商)");
- if (markerEles.size() > 0) {
- pEle = markerEles.first().select("a[href]").first();
- crawlerLoveFoot.setMaker(pEle.text());
- }
- // 发行商
- Elements issuerEles = container.select("div.info").select("p:contains(發行商)");
- if (issuerEles.size() > 0) {
- pEle = issuerEles.first().select("a[href]").first();
- crawlerLoveFoot.setIssuer(pEle.text());
- }
- // 类别
- Elements genresEles = container.select("div.info").select("p:contains(類別)");
- if (genresEles.size() > 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = genresEles.first().nextElementSibling().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text()).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setGenres(sb.toString());
- }
- // 演员
- Elements castEles = container.select("div.info").select("p.star-show:contains(演員)");
- if (castEles.size() > 0) {
- Elements castElesTemp = container.select("div.info:contains(暫無出演者資訊)");
- if (castElesTemp.size() == 0) {
- StringBuffer sb = new StringBuffer();
- Elements ahrefEles = castEles.first().nextElementSibling().nextElementSibling().select("a[href]");
- for (Element ahrefEle : ahrefEles) {
- sb.append(ahrefEle.text()).append(",");
- }
- if (sb.length() > 0) {
- sb = sb.deleteCharAt(sb.length() - 1);
- }
- crawlerLoveFoot.setCast(sb.toString());
- }
- }
- // 图片URL
- String href = container.select("a.bigImage").first().attr("abs:href");
- long start = System.currentTimeMillis();
- Connection.Response response = Jsoup.connect(href).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
- String fileName = issueDate.concat(" ").concat(h3).replace("/", "_").replace("[email protected]", "");
- byte[] imgUrlBytes = fileName.getBytes(StandardCharsets.UTF_8);
- if (imgUrlBytes.length > 251) {
- byte[] imgUrlDestBytes = new byte[251];
- System.arraycopy(imgUrlBytes, 0, imgUrlDestBytes, 0, 251);
- fileName = new String(imgUrlDestBytes, StandardCharsets.UTF_8).replace("�", "");
- }
- fileName = fileName.concat(".jpg");
- String machiImgUrl = "足舐/".concat(fileName);
- saveFile(response.bodyStream(), javbusConstantMap.get("apics_path").concat(machiImgUrl));
- long end = System.currentTimeMillis();
- crawlerLoveFoot.setImgUrl(machiImgUrl);
- return end - start;
- }
- /**
- * 保存文件到本地
- *
- * @param bufferedInputStream
- * @param savePath
- */
- private void saveFile(BufferedInputStream bufferedInputStream, String savePath) throws IOException {
- //一次最多读取1k
- byte[] buffer = new byte[1024];
- //实际读取的长度
- int readLenghth;
- //创建的一个写出的缓冲流
- BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePath)));
- //文件逐步写入本地
- while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
- bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
- }
- //关闭缓冲流
- bufferedOutputStream.close();
- bufferedInputStream.close();
- }
- public static void main(String[] args) throws UnsupportedEncodingException {
- String s = "リア充反対!彼女の目の前で彼氏を拘束、●す鬼畜痴女";
- String newName = s.substring(s.length() / 2);
- newName = newName.substring(newName.length() / 2);
- System.out.println(newName);
- String s1 = "https://jp-foot.net/av/%e8%bb%9f%e4%bd%93%e5%a5%b3%e5%ad%90%e5%a4%a7%e7%94%9f%e9%bb%92%e5%b7%9d%e3%81%99%e3%81%bf%e3%82%8c%ef%bc%88%e7%a8%b2%e5%b7%9d%e3%81%aa%e3%81%a4%e3%82%81%ef%bc%89%e3%81%ae%e3%82%aa%e3%83%8a%e3%83%8b/";
- System.out.println(s1.length());
- String decode = URLDecoder.decode(s1, "UTF-8");
- System.out.println(decode);
- }
- }
|