|
|
@@ -5,7 +5,9 @@ import com.github.pagehelper.PageHelper;
|
|
|
import com.github.pagehelper.PageInfo;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import net.coobird.thumbnailator.Thumbnails;
|
|
|
+import net.coobird.thumbnailator.tasks.UnsupportedFormatException;
|
|
|
import org.jsoup.Connection;
|
|
|
+import org.jsoup.HttpStatusException;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
|
import org.jsoup.nodes.Element;
|
|
|
@@ -36,6 +38,7 @@ import java.io.*;
|
|
|
import java.math.BigDecimal;
|
|
|
import java.math.RoundingMode;
|
|
|
import java.net.Proxy;
|
|
|
+import java.net.SocketTimeoutException;
|
|
|
import java.net.URLDecoder;
|
|
|
import java.time.LocalDate;
|
|
|
import java.util.ArrayList;
|
|
|
@@ -317,6 +320,7 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
|
|
|
crawlerImageLog.setPublishTime(publishTime);
|
|
|
int count = pictureInfoMapper.insertIgnoreFileCrawlerImageLog(crawlerImageLog);
|
|
|
findCount += count;
|
|
|
+ log.warn("jsoupFulibaPic item success:publishTime={},mainTitle={}", publishTime, mainTitle);
|
|
|
}
|
|
|
|
|
|
// 继续下一页
|
|
|
@@ -371,11 +375,18 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
|
|
|
Thread.sleep(5000L);
|
|
|
|
|
|
SpringUtils.getBean(PictureInfoServiceImpl.class).jsoupFulibaPicDetailSub(fileCrawlerImageLog.getMainUrl(), headerMap, fileCrawlerImageLog.getPublishTime(), fileCrawlerImageLog.getId());
|
|
|
+ if (2 == fileCrawlerImageLog.getStatus()) {
|
|
|
+ fileCrawlerImageLog.setFailureCause("");
|
|
|
+ }
|
|
|
fileCrawlerImageLog.setStatus(1);
|
|
|
successCount++;
|
|
|
} catch (Exception e) {
|
|
|
- fileCrawlerImageLog.setFailureCause(e.getMessage());
|
|
|
- fileCrawlerImageLog.setStatus(2);
|
|
|
+ fileCrawlerImageLog.setFailureCause(e.getMessage().length() > 200 ? e.getMessage().substring(0, 200) : e.getMessage());
|
|
|
+ if (e.getMessage().contains("timeoutCount equal imgEles size")) {
|
|
|
+ fileCrawlerImageLog.setStatus(4);
|
|
|
+ } else {
|
|
|
+ fileCrawlerImageLog.setStatus(2);
|
|
|
+ }
|
|
|
failCount++;
|
|
|
} finally {
|
|
|
pictureInfoMapper.insertOrUpdateFileCrawlerImageLog(fileCrawlerImageLog);
|
|
|
@@ -398,21 +409,35 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
|
|
|
String ftpThumbnailCrawlerBasePath = InitRunner.dicCodeMap.get("ftp_thumbnail_crawler_basepath").getCodeValue();
|
|
|
List<String> delPathList = new ArrayList<>();
|
|
|
String srcUrl = "";
|
|
|
+ List<FileCrawlerImage> fileCrawlerImageList = new ArrayList<>();
|
|
|
try {
|
|
|
fulibaDetailDocument = JsoupUtil.requestDocument(mainUrl, JsoupUtil.HTTP_GET, Proxy.NO_PROXY, null, headerMap, null);
|
|
|
- log.warn("jsoupFulibaPicDetailSub start:mainUrl={},logId={}", mainUrl, logId);
|
|
|
+ log.warn("jsoupFulibaPicDetailSub start:mainUrl={},publishTime={},logId={}", mainUrl, publishTime, logId);
|
|
|
Elements imgEles = fulibaDetailDocument.select("div.content > article.article-content").select("img");
|
|
|
int i = 0;
|
|
|
String parentPath = "1" + File.separator + publishTime.format(DateUtils.dateFormatter5);
|
|
|
+ Connection.Response response;
|
|
|
+ int timeoutCount = 0;
|
|
|
for (Element imgEle : imgEles) {
|
|
|
srcUrl = imgEle.attr("src");
|
|
|
String altTitle = imgEle.attr("alt");
|
|
|
-
|
|
|
newName = FtpUtil.genImageName();
|
|
|
String prefx = srcUrl.substring(srcUrl.lastIndexOf("."));
|
|
|
newName = newName + prefx;
|
|
|
+ try {
|
|
|
+ response = Jsoup.connect(srcUrl).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
|
|
|
+ } catch (SocketTimeoutException ioex) {
|
|
|
+ timeoutCount++;
|
|
|
+ continue;
|
|
|
+ } catch (HttpStatusException ioex) {
|
|
|
+ try {
|
|
|
+ response = Jsoup.connect(srcUrl).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
|
|
|
+ } catch (Exception e) {
|
|
|
+ timeoutCount++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- Connection.Response response = Jsoup.connect(srcUrl).method(Connection.Method.GET).ignoreContentType(true).timeout(50 * 1000).execute();
|
|
|
byte[] imageBytes = response.bodyAsBytes();
|
|
|
if (imageBytes.length == 0) {
|
|
|
// 过滤掉失效的图片链接
|
|
|
@@ -433,16 +458,21 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
|
|
|
fileCrawlerImage.setCategoryId(1L);
|
|
|
fileCrawlerImage.setOrginUrl(srcUrl);
|
|
|
fileCrawlerImage.setLogId(logId);
|
|
|
- fileCrawlerImage.setSort(++i);
|
|
|
- int count = pictureInfoMapper.insertIgnoreFileImage(fileCrawlerImage);
|
|
|
- if (count > 0) {
|
|
|
+ if (true) {
|
|
|
+ InputStream imageStream2 = new ByteArrayInputStream(imageBytes);
|
|
|
+ ByteArrayOutputStream thumbnailOutputStream = new ByteArrayOutputStream();
|
|
|
+ try {
|
|
|
+ Thumbnails.of(imageStream2).size(300, 200).toOutputStream(thumbnailOutputStream);
|
|
|
+ } catch (UnsupportedFormatException unsupportedFormatException) {
|
|
|
+ imageStream2.close();
|
|
|
+ thumbnailOutputStream.close();
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
InputStream imageStream1 = new ByteArrayInputStream(imageBytes);
|
|
|
FtpUtil.uploadFile(ftpImageCrawlerBasePath, parentPath, newName, imageStream1);
|
|
|
delPathList.add(ftpImageCrawlerBasePath + imageUrl);
|
|
|
|
|
|
- InputStream imageStream2 = new ByteArrayInputStream(imageBytes);
|
|
|
- ByteArrayOutputStream thumbnailOutputStream = new ByteArrayOutputStream();
|
|
|
- Thumbnails.of(imageStream2).size(300, 200).toOutputStream(thumbnailOutputStream);
|
|
|
ByteArrayInputStream thumbnailInputStream = new ByteArrayInputStream(thumbnailOutputStream.toByteArray());
|
|
|
FtpUtil.uploadFile(ftpThumbnailCrawlerBasePath, parentPath, newName, thumbnailInputStream);
|
|
|
delPathList.add(ftpThumbnailCrawlerBasePath + imageUrl);
|
|
|
@@ -450,7 +480,21 @@ public class PictureInfoServiceImpl extends BaseServiceImpl<Object> implements P
|
|
|
imageStream2.close();
|
|
|
thumbnailOutputStream.close();
|
|
|
}
|
|
|
+
|
|
|
+ fileCrawlerImage.setSort(++i);
|
|
|
+ fileCrawlerImageList.add(fileCrawlerImage);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!imgEles.isEmpty() && timeoutCount == imgEles.size()) {
|
|
|
+ log.warn("jsoupFulibaPicDetailSub timeoutCount is equals imgEles size,mainUrl={},publishTime={},timeoutCount={}", mainUrl, publishTime, timeoutCount);
|
|
|
+ throw new BusinessException(ResultCodeEnum.UNKNOWN_ERROR.getCode(), "timeoutCount equal imgEles size");
|
|
|
}
|
|
|
+ if (!imgEles.isEmpty() && timeoutCount > 4) {
|
|
|
+ log.warn("jsoupFulibaPicDetailSub timeoutCount is Too many,mainUrl={},publishTime={},timeoutCount={}", mainUrl, publishTime, timeoutCount);
|
|
|
+ throw new BusinessException(ResultCodeEnum.UNKNOWN_ERROR.getCode(), "timeoutCount is Too many");
|
|
|
+ }
|
|
|
+
|
|
|
+ int count = pictureInfoMapper.insertIgnoreFileImageList(fileCrawlerImageList);
|
|
|
} catch (Exception e) {
|
|
|
// 异常,删除已经上传的文件
|
|
|
if (!delPathList.isEmpty()) {
|