|
|
@@ -52,25 +52,16 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
|
|
|
|
|
|
Map<String, String> javbusConstantMap = null;
|
|
|
Map<String, String> javbusCookiesMap = null;
|
|
|
- private String bdAccessToken = "";
|
|
|
+ String bdAccessToken = "";
|
|
|
+ Proxy proxy = null;
|
|
|
|
|
|
- @Async
|
|
|
- @Override
|
|
|
- public void jsoupJavbusProfile(Long start, Integer limit) throws Exception {
|
|
|
- log.warn("jsoupJavbusProfile 开始:start={},limit={}", start, limit);
|
|
|
- StopWatch stopWatch = new StopWatch();
|
|
|
- stopWatch.start();
|
|
|
-
|
|
|
- // 获取javbus常量MAP
|
|
|
- javbusConstantMap = dicCodeMapper.findAll().stream()
|
|
|
- .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
|
|
|
- .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
- // 代理及TOKEN设置
|
|
|
- Proxy proxy;
|
|
|
- if ("dev".equals(env)) {
|
|
|
- proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
|
|
|
- } else {
|
|
|
- proxy = Proxy.NO_PROXY;
|
|
|
+ public void beforeJavbus() throws Exception {
|
|
|
+ if (null == proxy) {
|
|
|
+ if ("dev".equals(env)) {
|
|
|
+ proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
|
|
|
+ } else {
|
|
|
+ proxy = Proxy.NO_PROXY;
|
|
|
+ }
|
|
|
}
|
|
|
if (StringUtils.isEmpty(bdAccessToken)) {
|
|
|
bdAccessToken = getAuth(javbusConstantMap.get("bd_ak"), javbusConstantMap.get("bd_sk"));
|
|
|
@@ -87,11 +78,82 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
|
|
|
throw new Exception("获取javbusCookies失败!");
|
|
|
}
|
|
|
}
|
|
|
+ }
|
|
|
+
|
|
|
+ @Async
|
|
|
+ @Override
|
|
|
+ public void jsoupJavbusProfile(Long start, Integer limit) throws Exception {
|
|
|
+ log.warn("jsoupJavbusProfile 开始:start={},limit={}", start, limit);
|
|
|
+ StopWatch stopWatch = new StopWatch();
|
|
|
+ stopWatch.start();
|
|
|
+
|
|
|
+ // 获取javbus常量MAP
|
|
|
+ javbusConstantMap = dicCodeMapper.findAll().stream()
|
|
|
+ .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ // 代理及TOKEN设置
|
|
|
+ beforeJavbus();
|
|
|
// 获取个人资料
|
|
|
jsoupJavbusProfileSub(proxy, start, limit);
|
|
|
log.warn("jsoupJavbusProfile 结束:time={}", stopWatch.getTotalTimeSeconds());
|
|
|
}
|
|
|
|
|
|
+ @Async
|
|
|
+ @Override
|
|
|
+ public void handleJavbusLog(Integer status) throws Exception {
|
|
|
+ log.warn("handleJavbusLog 开始:status={}", status);
|
|
|
+ StopWatch stopWatch = new StopWatch();
|
|
|
+ stopWatch.start();
|
|
|
+
|
|
|
+ // 获取javbus常量MAP
|
|
|
+ javbusConstantMap = dicCodeMapper.findAll().stream()
|
|
|
+ .filter(x -> "javbus".equals(x.getCodeDesc()) && x.getEnv().contains(env))
|
|
|
+ .collect(Collectors.toMap(DicCode::getCodeKey, DicCode::getCodeValue, (key1, key2) -> key1));
|
|
|
+ // 代理及TOKEN设置
|
|
|
+ beforeJavbus();
|
|
|
+ // 获取个人资料
|
|
|
+ handleJavbusLogSub(status);
|
|
|
+ log.warn("handleJavbusLog 结束:time={}", stopWatch.getTotalTimeSeconds());
|
|
|
+ }
|
|
|
+
|
|
|
+ private void handleJavbusLogSub(Integer status) {
|
|
|
+ List<CrawlerJavbusLog> javbusLogByStatus = crawlerJavbusProfileMapper.findJavbusLogByStatus(status);
|
|
|
+
|
|
|
+ String profileUrl = "https://www.javbus.com/forum/?";
|
|
|
+ Document profileDocument;
|
|
|
+ for (CrawlerJavbusLog javbusLog : javbusLogByStatus) {
|
|
|
+ String uid = "";
|
|
|
+ String nickName = "";
|
|
|
+ try {
|
|
|
+ profileDocument = JsoupUtil.requestDocument(profileUrl.concat(javbusLog.getBusinessKey()), JsoupUtil.HTTP_GET, proxy, javbusCookiesMap, null, null);
|
|
|
+ if (profileDocument.html().contains("您指定的用戶空間不存在")) {
|
|
|
+ log.warn("jsoupJavbusProfileSub您指定的用戶空間不存在,start={}", javbusLog.getBusinessKey());
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ String[] mbn0Arr = profileDocument.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
|
|
|
+ nickName = mbn0Arr[0].trim();
|
|
|
+ uid = mbn0Arr[1].trim();
|
|
|
+
|
|
|
+ CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
|
|
|
+ crawlerJavbusProfile.setUid(Long.valueOf(uid));
|
|
|
+ crawlerJavbusProfile.setNickName(nickName);
|
|
|
+ parseJavbusProfile(profileDocument, crawlerJavbusProfile);
|
|
|
+ crawlerJavbusProfileMapper.insertOrUpdate(crawlerJavbusProfile);
|
|
|
+ log.warn("jsoupJavbusProfileSub成功插入,uid={}", uid);
|
|
|
+
|
|
|
+ javbusLog.setStatus(2);
|
|
|
+ javbusLog.setErrorMsg("");
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("jsoupJavbusProfileSub插入异常,nickName={},uid={}", nickName, uid, e);
|
|
|
+ javbusLog.setStatus(3);
|
|
|
+ javbusLog.setErrorMsg(e.getMessage());
|
|
|
+ }
|
|
|
+
|
|
|
+ crawlerJavbusProfileMapper.insertOrUpdateLog(javbusLog);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
private void jsoupJavbusProfileSub(Proxy proxy, Long start, Integer limit) throws Exception {
|
|
|
CrawlerJavbusProfile latestJavbusProfile = crawlerJavbusProfileMapper.findLatestInfo();
|
|
|
if (start == null && latestJavbusProfile == null) {
|
|
|
@@ -130,76 +192,21 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
- String avatarUrl = profileDocument.select("div.avt").select("img").attr("src");
|
|
|
String[] mbn0Arr = profileDocument.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
|
|
|
nickName = mbn0Arr[0].trim();
|
|
|
uid = mbn0Arr[1].trim();
|
|
|
- String emailStatus = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").first().text().replace("郵箱狀態", "").trim();
|
|
|
-
|
|
|
- Elements signEles = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(1).select("li:contains(個人簽名)");
|
|
|
- String signStr = "";
|
|
|
- ArrayList<String> signImgList = new ArrayList<>();
|
|
|
- if (signEles.size() > 0) {
|
|
|
- signStr = signEles.first().select("table").text();
|
|
|
- Elements signImgEles = signEles.first().select("table").select("img");
|
|
|
- for (Element signImgEle : signImgEles) {
|
|
|
- signImgList.add(signImgEle.attr("src"));
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- String friendNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
|
|
|
- .select("a").get(0).text().replace("好友數", "").trim();
|
|
|
- String replyNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
|
|
|
- .select("a").get(1).text().replace("回帖數", "").trim();
|
|
|
- String threadNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
|
|
|
- .select("a").get(2).text().replace("主題數", "").trim();
|
|
|
- String userGroup = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(0)
|
|
|
- .select("a").text();
|
|
|
- String onlineTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
- .select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
|
|
|
- String registrationTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
- .select("li:contains(註冊時間)").text().replace("註冊時間", "").trim();
|
|
|
- String lastVisit = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
- .select("li:contains(最後訪問)").text().replace("最後訪問", "").trim();
|
|
|
- String lastActivityTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
- .select("li:contains(上次活動時間)").text().replace("上次活動時間", "").trim();
|
|
|
- String lastPublishedTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
- .select("li:contains(上次發表時間)").text().replace("上次發表時間", "").trim();
|
|
|
- String timeZone = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
- .select("li:contains(所在時區)").text().replace("所在時區", "").trim();
|
|
|
- String usedSpace = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
|
|
|
- .select("li").get(0).text().replace("已用空間", "").replace("B", "").trim();
|
|
|
- String mileage = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
|
|
|
- .select("li").get(1).text().replace("里程", "").trim();
|
|
|
- String money = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
|
|
|
- .select("li").last().text().replace("金錢", "").trim();
|
|
|
|
|
|
CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
|
|
|
crawlerJavbusProfile.setUid(Long.valueOf(uid));
|
|
|
crawlerJavbusProfile.setNickName(nickName);
|
|
|
- crawlerJavbusProfile.setEmailStatus(emailStatus);
|
|
|
- crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
|
|
|
- crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
|
|
|
- crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
|
|
|
- crawlerJavbusProfile.setUserGroup(userGroup);
|
|
|
- crawlerJavbusProfile.setOnlineTime(StringUtils.isNotEmpty(onlineTime) ? Integer.valueOf(onlineTime) : null);
|
|
|
- crawlerJavbusProfile.setRegistrationTime(StringUtils.isNotEmpty(registrationTime) && registrationTime.length() >= 10 ? LocalDateTime.parse(registrationTime, DateUtils.dateTimeFormatter3) : null);
|
|
|
- crawlerJavbusProfile.setLastVisit(StringUtils.isNotEmpty(lastVisit) && lastVisit.length() >= 10 ? LocalDateTime.parse(lastVisit, DateUtils.dateTimeFormatter3) : null);
|
|
|
- crawlerJavbusProfile.setLastActivityTime(StringUtils.isNotEmpty(lastActivityTime) && lastActivityTime.length() >= 10 ? LocalDateTime.parse(lastActivityTime, DateUtils.dateTimeFormatter3) : null);
|
|
|
- crawlerJavbusProfile.setLastPublishedTime(StringUtils.isNotEmpty(lastPublishedTime) && lastPublishedTime.length() >= 10 ? LocalDateTime.parse(lastPublishedTime, DateUtils.dateTimeFormatter3) : null);
|
|
|
- crawlerJavbusProfile.setTimeZone(StringUtils.isNotEmpty(timeZone) ? timeZone : null);
|
|
|
- crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
|
|
|
- crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
|
|
|
- crawlerJavbusProfile.setMoney(Integer.valueOf(money));
|
|
|
- crawlerJavbusProfile.setAvatarUrl(avatarUrl);
|
|
|
- crawlerJavbusProfile.setSignStr(signStr);
|
|
|
- crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
|
|
|
+ parseJavbusProfile(profileDocument, crawlerJavbusProfile);
|
|
|
crawlerJavbusProfileMapper.insertOrUpdate(crawlerJavbusProfile);
|
|
|
log.warn("jsoupJavbusProfileSub成功插入,uid={}", uid);
|
|
|
} catch (Exception e) {
|
|
|
log.error("jsoupJavbusProfileSub插入异常,nickName={},uid={}", nickName, uid, e);
|
|
|
CrawlerJavbusLog crawlerJavbusLog = new CrawlerJavbusLog();
|
|
|
crawlerJavbusLog.setType(1);
|
|
|
+ crawlerJavbusLog.setStatus(1);
|
|
|
crawlerJavbusLog.setBusinessKey(uid);
|
|
|
crawlerJavbusLog.setErrorMsg(e.getMessage());
|
|
|
crawlerJavbusProfileMapper.insertOrUpdateLog(crawlerJavbusLog);
|
|
|
@@ -207,6 +214,67 @@ public class Crawler4JavbusServiceImpl implements Crawler4JavbusService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ public void parseJavbusProfile(Document profileDocument, CrawlerJavbusProfile crawlerJavbusProfile) {
|
|
|
+ String avatarUrl = profileDocument.select("div.avt").select("img").attr("src");
|
|
|
+ String emailStatus = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").first().text().replace("郵箱狀態", "").trim();
|
|
|
+
|
|
|
+ Elements signEles = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(1).select("li:contains(個人簽名)");
|
|
|
+ String signStr = "";
|
|
|
+ ArrayList<String> signImgList = new ArrayList<>();
|
|
|
+ if (signEles.size() > 0) {
|
|
|
+ signStr = signEles.first().select("table").text();
|
|
|
+ Elements signImgEles = signEles.first().select("table").select("img");
|
|
|
+ for (Element signImgEle : signImgEles) {
|
|
|
+ signImgList.add(signImgEle.attr("src"));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ String friendNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
|
|
|
+ .select("a").get(0).text().replace("好友數", "").trim();
|
|
|
+ String replyNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
|
|
|
+ .select("a").get(1).text().replace("回帖數", "").trim();
|
|
|
+ String threadNum = profileDocument.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
|
|
|
+ .select("a").get(2).text().replace("主題數", "").trim();
|
|
|
+ String userGroup = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(0)
|
|
|
+ .select("a").text();
|
|
|
+ String onlineTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
+ .select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
|
|
|
+ String registrationTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
+ .select("li:contains(註冊時間)").text().replace("註冊時間", "").trim();
|
|
|
+ String lastVisit = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
+ .select("li:contains(最後訪問)").text().replace("最後訪問", "").trim();
|
|
|
+ String lastActivityTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
+ .select("li:contains(上次活動時間)").text().replace("上次活動時間", "").trim();
|
|
|
+ String lastPublishedTime = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
+ .select("li:contains(上次發表時間)").text().replace("上次發表時間", "").trim();
|
|
|
+ String timeZone = profileDocument.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
|
|
|
+ .select("li:contains(所在時區)").text().replace("所在時區", "").trim();
|
|
|
+ String usedSpace = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
|
|
|
+ .select("li").get(0).text().replace("已用空間", "").replace("B", "").trim();
|
|
|
+ String mileage = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
|
|
|
+ .select("li").get(1).text().replace("里程", "").trim();
|
|
|
+ String money = profileDocument.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
|
|
|
+ .select("li").last().text().replace("金錢", "").trim();
|
|
|
+
|
|
|
+ crawlerJavbusProfile.setEmailStatus(emailStatus);
|
|
|
+ crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
|
|
|
+ crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
|
|
|
+ crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
|
|
|
+ crawlerJavbusProfile.setUserGroup(userGroup);
|
|
|
+ crawlerJavbusProfile.setOnlineTime(StringUtils.isNotEmpty(onlineTime) ? Integer.valueOf(onlineTime) : null);
|
|
|
+ crawlerJavbusProfile.setRegistrationTime(StringUtils.isNotEmpty(registrationTime) && registrationTime.length() >= 10 ? LocalDateTime.parse(registrationTime, DateUtils.dateTimeFormatter3) : null);
|
|
|
+ crawlerJavbusProfile.setLastVisit(StringUtils.isNotEmpty(lastVisit) && lastVisit.length() >= 10 ? LocalDateTime.parse(lastVisit, DateUtils.dateTimeFormatter3) : null);
|
|
|
+ crawlerJavbusProfile.setLastActivityTime(StringUtils.isNotEmpty(lastActivityTime) && lastActivityTime.length() >= 10 ? LocalDateTime.parse(lastActivityTime, DateUtils.dateTimeFormatter3) : null);
|
|
|
+ crawlerJavbusProfile.setLastPublishedTime(StringUtils.isNotEmpty(lastPublishedTime) && lastPublishedTime.length() >= 10 ? LocalDateTime.parse(lastPublishedTime, DateUtils.dateTimeFormatter3) : null);
|
|
|
+ crawlerJavbusProfile.setTimeZone(StringUtils.isNotEmpty(timeZone) ? timeZone : null);
|
|
|
+ crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
|
|
|
+ crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
|
|
|
+ crawlerJavbusProfile.setMoney(Integer.valueOf(money));
|
|
|
+ crawlerJavbusProfile.setAvatarUrl(avatarUrl);
|
|
|
+ crawlerJavbusProfile.setSignStr(signStr);
|
|
|
+ crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
|
|
|
+ }
|
|
|
+
|
|
|
private boolean generateJavbusCookies(Proxy proxy) throws Exception {
|
|
|
// 1 登陆获取cookies
|
|
|
// 1.0 https://www.javbus.com/forum/forum.php
|