|
|
@@ -0,0 +1,261 @@
|
|
|
+package top.lvzhiqiang.service.impl;
|
|
|
+
|
|
|
+import com.alibaba.fastjson.JSONArray;
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.jsoup.Connection;
|
|
|
+import org.jsoup.Jsoup;
|
|
|
+import org.springframework.beans.factory.annotation.Value;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+import org.springframework.transaction.annotation.Propagation;
|
|
|
+import org.springframework.transaction.annotation.Transactional;
|
|
|
+import org.springframework.util.StopWatch;
|
|
|
+import top.lvzhiqiang.entity.CrawlerXiaoeknowCourse;
|
|
|
+import top.lvzhiqiang.entity.DicCode;
|
|
|
+import top.lvzhiqiang.exception.BusinessException;
|
|
|
+import top.lvzhiqiang.mapper.CrawlerXiaoeknowCourseMapper;
|
|
|
+import top.lvzhiqiang.mapper.DicCodeMapper;
|
|
|
+import top.lvzhiqiang.service.CrawlerService;
|
|
|
+import top.lvzhiqiang.util.DateUtils;
|
|
|
+
|
|
|
+import javax.annotation.Resource;
|
|
|
+import java.io.IOException;
|
|
|
+import java.time.LocalDate;
|
|
|
+import java.time.LocalDateTime;
|
|
|
+import java.util.*;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * Crawler ServiceImpl
|
|
|
+ *
|
|
|
+ * @author lvzhiqiang
|
|
|
+ * 2022/9/10 21:24
|
|
|
+ */
|
|
|
+@Service
|
|
|
+@Slf4j
|
|
|
+public class CrawlerServiceImpl implements CrawlerService {
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private CrawlerXiaoeknowCourseMapper crawlerXiaoeknowCourseMapper;
|
|
|
+ @Resource
|
|
|
+ private DicCodeMapper dicCodeMapper;
|
|
|
+
|
|
|
+ @Value("${spring.profiles.active}")
|
|
|
+ private String env;
|
|
|
+
|
|
|
+ private final int defaultPageSize = 8;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * findXiaoeknowCourse
|
|
|
+ *
|
|
|
+ * @author lvzhiqiang
|
|
|
+ * 2022/9/11 17:01
|
|
|
+ */
|
|
|
+ @Override
|
|
|
+ public String findXiaoeknowCourse(String title, Integer type, String orderField, String order, String crudT) {
|
|
|
+ if ("2".equals(crudT)) {
|
|
|
+ //更新
|
|
|
+ return "success";
|
|
|
+ }
|
|
|
+ if ("3".equals(crudT)) {
|
|
|
+ //删除
|
|
|
+ return "success";
|
|
|
+ }
|
|
|
+
|
|
|
+ Map<String, Object> params = new HashMap<>();
|
|
|
+ params.put("title", title);
|
|
|
+ params.put("type", type);
|
|
|
+ params.put("orderField", orderField);
|
|
|
+ params.put("order", order);
|
|
|
+ List<CrawlerXiaoeknowCourse> crawlerXiaoeknowCourseList = crawlerXiaoeknowCourseMapper.findXiaoeknowCourse4MultipleParams(params);
|
|
|
+
|
|
|
+ StringBuffer sb = new StringBuffer("total:".concat(String.valueOf(crawlerXiaoeknowCourseList.size())).concat("<br/>"));
|
|
|
+ sb.append("<table border=\"1\" cellspacing=\"0\"><tr><th>resourceTitle</th><th>resourceType</th><th>viewCount</th><th>startAt</th><th>appId</th><th>columnId</th><th>resourceId</th></tr>");
|
|
|
+
|
|
|
+ String videoTemplateUrl = "https://appId.h5.xiaoeknow.com/p/course/video/resourceId?product_id=columnId";
|
|
|
+ String liveTemplateUrl = "https://appId.h5.xiaoeknow.com/v2/course/alive/resourceId?type=2&pro_id=columnId&app_id=appId";
|
|
|
+ for (CrawlerXiaoeknowCourse crawlerXiaoeknowCourse : crawlerXiaoeknowCourseList) {
|
|
|
+ sb.append("<tr>");
|
|
|
+
|
|
|
+ String resourceType = "";
|
|
|
+ String url = "";
|
|
|
+ if (3 == crawlerXiaoeknowCourse.getResourceType()) {
|
|
|
+ resourceType = "视频";
|
|
|
+ url = videoTemplateUrl.replace("appId", crawlerXiaoeknowCourse.getAppId()).replace("columnId", crawlerXiaoeknowCourse.getColumnId()).replace("resourceId", crawlerXiaoeknowCourse.getResourceId());
|
|
|
+ } else if (4 == crawlerXiaoeknowCourse.getResourceType()) {
|
|
|
+ resourceType = "直播";
|
|
|
+ url = liveTemplateUrl.replace("appId", crawlerXiaoeknowCourse.getAppId()).replace("columnId", crawlerXiaoeknowCourse.getColumnId()).replace("resourceId", crawlerXiaoeknowCourse.getResourceId());
|
|
|
+ }
|
|
|
+ sb.append("<td><a target=\"_blank\" href=\"" + url + "\">").append(crawlerXiaoeknowCourse.getResourceTitle()).append("</a></td>");
|
|
|
+
|
|
|
+ sb.append("<td>").append(resourceType).append("</td>");
|
|
|
+
|
|
|
+ sb.append("<td>").append(crawlerXiaoeknowCourse.getViewCount()).append("</td>");
|
|
|
+ sb.append("<td>").append(crawlerXiaoeknowCourse.getStartAt()).append("</td>");
|
|
|
+ sb.append("<td>").append(crawlerXiaoeknowCourse.getAppId()).append("</td>");
|
|
|
+ sb.append("<td>").append(crawlerXiaoeknowCourse.getColumnId()).append("</td>");
|
|
|
+ sb.append("<td>").append(crawlerXiaoeknowCourse.getResourceId()).append("</td>");
|
|
|
+
|
|
|
+ sb.append("</tr>");
|
|
|
+ }
|
|
|
+ sb.append("</table>");
|
|
|
+
|
|
|
+ return sb.toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * jsoupXiaoeknowCourse
|
|
|
+ *
|
|
|
+ * @author lvzhiqiang
|
|
|
+ * 2022/9/10 21:24
|
|
|
+ */
|
|
|
+ //@Async
|
|
|
+ @Override
|
|
|
+ @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Exception.class)
|
|
|
+ public void jsoupXiaoeknowCourse(Integer status, Integer isDel, Integer ignoreRetryCount) {
|
|
|
+ log.warn("jsoupXiaoeknowCourse 开始:status={},isDel={},ignoreRetryCount={}", status, isDel, ignoreRetryCount);
|
|
|
+ StopWatch stopWatch = new StopWatch();
|
|
|
+ stopWatch.start();
|
|
|
+ if (isDel == 1) {
|
|
|
+ crawlerXiaoeknowCourseMapper.deleteAll();
|
|
|
+ }
|
|
|
+
|
|
|
+ String itemsURL = "https://aaaaaaaaaaaaaaaaaaaa.h5.xiaoeknow.com/xe.course.business.column.items.get/2.0.0";
|
|
|
+ String baseInfoURL = "https://aaaaaaaaaaaaaaaaaaaa.h5.xiaoeknow.com/xe.course.business.column.base_info.get/2.0.0";
|
|
|
+
|
|
|
+ // 获取课程参数
|
|
|
+ List<DicCode> dicCodeList = dicCodeMapper.findAll().stream().filter(x -> "xiaoeknow_url".equals(x.getCodeDesc()) && x.getEnv().contains(env)).collect(Collectors.toList());
|
|
|
+
|
|
|
+ Map<String, String> params = new HashMap<>();
|
|
|
+ int insertTotalNum = 0;
|
|
|
+ for (DicCode dicCode : dicCodeList) {
|
|
|
+ String appId = dicCode.getCodeKey();
|
|
|
+ String[] columnIdArr = dicCode.getCodeValue().split(",");
|
|
|
+
|
|
|
+ itemsURL = itemsURL.replace("aaaaaaaaaaaaaaaaaaaa", appId);
|
|
|
+ baseInfoURL = baseInfoURL.replace("aaaaaaaaaaaaaaaaaaaa", appId);
|
|
|
+ for (String columnId : columnIdArr) {
|
|
|
+ params.put("bizData[column_id]", columnId);
|
|
|
+
|
|
|
+ // 获取最新的一条
|
|
|
+ CrawlerXiaoeknowCourse latestInfo = crawlerXiaoeknowCourseMapper.findLatestInfo(appId, columnId);
|
|
|
+ LocalDate latestDate = latestInfo == null ? LocalDate.of(1970, 1, 1) : latestInfo.getStartAt();
|
|
|
+
|
|
|
+ StringBuffer referer = new StringBuffer("https://");
|
|
|
+ referer.append(appId).append(".h5.xiaoeknow.com/p/course/column/").append(columnId).append("?type=3");
|
|
|
+
|
|
|
+ // 获取总数
|
|
|
+ Connection.Response response;
|
|
|
+ int total = 0;
|
|
|
+ Map<String, String> cookies;
|
|
|
+ try {
|
|
|
+ cookies = getCookies(referer.toString());
|
|
|
+ response = Jsoup.connect(baseInfoURL)
|
|
|
+ .header("Content-Type", "application/x-www-form-urlencoded")
|
|
|
+ .timeout(50000)
|
|
|
+ //.proxy()
|
|
|
+ .data(params)
|
|
|
+ .ignoreContentType(true)
|
|
|
+ .userAgent(getUserAgent())
|
|
|
+ .header("referer", referer.toString())
|
|
|
+ .cookies(cookies)
|
|
|
+ .method(Connection.Method.POST)
|
|
|
+ .execute();
|
|
|
+
|
|
|
+ JSONObject result = JSONObject.parseObject(response.body());
|
|
|
+ total = result.getJSONObject("data").getInteger("resource_count");
|
|
|
+ log.warn("jsoupXiaoeknowCourse 获取总数:appId={},columnId={},total={}", appId, columnId, total);
|
|
|
+ } catch (Exception e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ throw new BusinessException(500, e.getMessage());
|
|
|
+ }
|
|
|
+
|
|
|
+ params.put("bizData[page_size]", String.valueOf(defaultPageSize));
|
|
|
+ params.put("bizData[sort]", "desc");
|
|
|
+ int lastPageNo = (total / defaultPageSize) + (total % defaultPageSize > 0 ? 1 : 0);
|
|
|
+ List<CrawlerXiaoeknowCourse> xiaoeknowCourseList = new ArrayList<>();
|
|
|
+ outer:
|
|
|
+ for (int currentPageNo = 1; currentPageNo <= lastPageNo; currentPageNo++) {
|
|
|
+ params.put("bizData[page_index]", String.valueOf(currentPageNo));
|
|
|
+ try {
|
|
|
+ response = Jsoup.connect(itemsURL)
|
|
|
+ .header("Content-Type", "application/x-www-form-urlencoded")
|
|
|
+ .timeout(50000)
|
|
|
+ //.proxy()
|
|
|
+ .data(params)
|
|
|
+ .ignoreContentType(true)
|
|
|
+ .userAgent(getUserAgent())
|
|
|
+ .header("referer", referer.toString())
|
|
|
+ .cookies(cookies)
|
|
|
+ .method(Connection.Method.POST)
|
|
|
+ .execute();
|
|
|
+
|
|
|
+ JSONObject result = JSONObject.parseObject(response.body());
|
|
|
+ JSONArray jsonArray = result.getJSONObject("data").getJSONArray("list");
|
|
|
+ int currentPageIndex = 0;
|
|
|
+ for (Object o : jsonArray) {
|
|
|
+ JSONObject jo = (JSONObject) o;
|
|
|
+ ++currentPageIndex;
|
|
|
+
|
|
|
+ LocalDate startAt = LocalDate.parse(jo.getString("start_at").substring(0, 10), DateUtils.dateFormatter2);
|
|
|
+ if (startAt.isBefore(latestDate) || startAt.isEqual(latestDate)) {
|
|
|
+ break outer;
|
|
|
+ }
|
|
|
+
|
|
|
+ CrawlerXiaoeknowCourse crawlerXiaoeknowCourse = new CrawlerXiaoeknowCourse();
|
|
|
+ crawlerXiaoeknowCourse.setAppId(appId);
|
|
|
+ crawlerXiaoeknowCourse.setColumnId(columnId);
|
|
|
+ crawlerXiaoeknowCourse.setResourceId(jo.getString("resource_id"));
|
|
|
+ crawlerXiaoeknowCourse.setResourceTitle(jo.getString("resource_title"));
|
|
|
+ crawlerXiaoeknowCourse.setResourceType(jo.getInteger("resource_type"));
|
|
|
+ crawlerXiaoeknowCourse.setViewCount(jo.getInteger("view_count"));
|
|
|
+ crawlerXiaoeknowCourse.setStartAt(startAt);
|
|
|
+ crawlerXiaoeknowCourse.setCreateTime(LocalDateTime.now());
|
|
|
+ xiaoeknowCourseList.add(crawlerXiaoeknowCourse);
|
|
|
+ log.warn("jsoupXiaoeknowCourse success:currentPageNo={},currentPageIndex={},resourceTitle={}", currentPageNo, currentPageIndex, crawlerXiaoeknowCourse.getResourceTitle());
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ log.error("jsoupXiaoeknowCourse error,params={}", params, e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (xiaoeknowCourseList.size() > 0) {
|
|
|
+ crawlerXiaoeknowCourseMapper.insertList(xiaoeknowCourseList);
|
|
|
+ insertTotalNum += xiaoeknowCourseList.size();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ stopWatch.stop();
|
|
|
+ log.warn("jsoupXiaoeknowCourse 结束:insertTotalNum={},耗时={}", insertTotalNum, stopWatch.getTotalTimeSeconds());
|
|
|
+ }
|
|
|
+
|
|
|
+ private Map<String, String> getCookies(String url) throws IOException {
|
|
|
+ Connection.Response res1 = Jsoup.connect(url).method(Connection.Method.GET).execute();
|
|
|
+ return res1.cookies();
|
|
|
+ }
|
|
|
+
|
|
|
+ private String getUserAgent() {
|
|
|
+ Random r = new Random();
|
|
|
+ String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
|
|
|
+ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"};
|
|
|
+ int i = r.nextInt(15);
|
|
|
+ return ua[i];
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|