||
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import top.lvzhiqiang.entity.CrawlerJavbusProfile;
import top.lvzhiqiang.util.DateUtils;
import top.lvzhiqiang.util.JsoupUtil;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.util.*;
- public class Test4Javbus {
- public static void main(String[] args) throws Exception {
- //setupOne();
- setupTwo();
- }
- private static void setupTwo() throws Exception {
- File file = new File("d:\\zhiqiang.lv\\Desktop", "1.html");
- // file = new File("C:\\Users\\l1024v\\Desktop", "1.html");
- Document document = Jsoup.parse(file, "UTF-8");
- String avatarUrl = document.select("div.avt").select("img").attr("src");
- String[] mbn0Arr = document.select("div.u_profile").select("div.cl").get(0).select("h2.mbn").get(0).text().replace("(", "").replace(")", "").split("UID:");
- String nickName = mbn0Arr[0].trim();
- String uid = mbn0Arr[1].trim();
- String emailStatus = document.select("div.u_profile").select("div.cl").get(0).select("ul").first().text().replace("郵箱狀態", "").trim();
- Elements signEles = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(1).select("li:contains(個人簽名)");
- String signStr = "";
- ArrayList<String> signImgList = new ArrayList<>();
- if (signEles.size() > 0) {
- signStr = signEles.first().select("table").text();
- Elements signImgEles = signEles.first().select("table").select("img");
- for (Element signImgEle : signImgEles) {
- signImgList.add(signImgEle.attr("src"));
- }
- }
- String friendNum = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
- .select("a").get(0).text().replace("好友數", "").trim();
- String replyNum = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
- .select("a").get(1).text().replace("回帖數", "").trim();
- String threadNum = document.select("div.u_profile").select("div.cl").get(0).select("ul").get(2)
- .select("a").get(2).text().replace("主題數", "").trim();
- String userGroup = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(0)
- .select("a").text();
- String onlineTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
- .select("li:contains(在線時間)").text().replace("在線時間", "").replace("小時", "").trim();
- String registrationTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
- .select("li:contains(註冊時間)").text().replace("註冊時間", "").trim();
- String lastVisit = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
- .select("li:contains(最後訪問)").text().replace("最後訪問", "").trim();
- String lastActivityTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
- .select("li:contains(上次活動時間)").text().replace("上次活動時間", "").trim();
- String lastPublishedTime = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
- .select("li:contains(上次發表時間)").text().replace("上次發表時間", "").trim();
- String timeZone = document.select("div.u_profile").select("div.cl").get(1).select("ul").get(1)
- .select("li:contains(所在時區)").text().replace("所在時區", "").trim();
- String usedSpace = document.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
- .select("li").get(0).text().replace("已用空間", "").replace("B", "").trim();
- String mileage = document.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
- .select("li").get(1).text().replace("里程", "").trim();
- String money = document.select("div.u_profile").select("div.cl").get(2).select("ul").get(0)
- .select("li").last().text().replace("金錢", "").trim();
- CrawlerJavbusProfile crawlerJavbusProfile = new CrawlerJavbusProfile();
- crawlerJavbusProfile.setUid(Long.valueOf(uid));
- crawlerJavbusProfile.setNickName(nickName);
- crawlerJavbusProfile.setEmailStatus(emailStatus);
- crawlerJavbusProfile.setFriendNum(Integer.valueOf(friendNum));
- crawlerJavbusProfile.setReplyNum(Integer.valueOf(replyNum));
- crawlerJavbusProfile.setThreadNum(Integer.valueOf(threadNum));
- crawlerJavbusProfile.setUserGroup(userGroup);
- crawlerJavbusProfile.setOnlineTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(onlineTime) ? Integer.valueOf(onlineTime) : null);
- crawlerJavbusProfile.setRegistrationTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(registrationTime) ? LocalDateTime.parse(registrationTime, DateUtils.dateTimeFormatter3) : null);
- crawlerJavbusProfile.setLastVisit(top.lvzhiqiang.util.StringUtils.isNotEmpty(lastVisit) ? LocalDateTime.parse(lastVisit, DateUtils.dateTimeFormatter3) : null);
- crawlerJavbusProfile.setLastActivityTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(lastActivityTime) ? LocalDateTime.parse(lastActivityTime, DateUtils.dateTimeFormatter3) : null);
- crawlerJavbusProfile.setLastPublishedTime(top.lvzhiqiang.util.StringUtils.isNotEmpty(lastPublishedTime) ? LocalDateTime.parse(lastPublishedTime, DateUtils.dateTimeFormatter3) : null);
- crawlerJavbusProfile.setTimeZone(top.lvzhiqiang.util.StringUtils.isNotEmpty(timeZone) ? timeZone : null);
- crawlerJavbusProfile.setUsedSpace(Integer.valueOf(usedSpace));
- crawlerJavbusProfile.setMileage(Integer.valueOf(mileage));
- crawlerJavbusProfile.setMoney(Integer.valueOf(money));
- crawlerJavbusProfile.setAvatarUrl(avatarUrl);
- crawlerJavbusProfile.setSignStr(signStr);
- crawlerJavbusProfile.setSignImg(org.apache.commons.lang3.StringUtils.join(signImgList, ","));
- System.out.println(crawlerJavbusProfile);
- }
- private static void setupOne() throws Exception {
- // 代理及TOKEN设置
- Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
- // 1 登陆获取cookies
- // 1.0 https://www.javbus.com/forum/forum.php
- Connection.Response forumResponse = JsoupUtil.requestBody("https://www.javbus.com/forum/forum.php", JsoupUtil.HTTP_GET, proxy, null);
- Map<String, String> forumCookies = forumResponse.cookies();
- System.out.println("forumCookies=" + forumCookies);
- // 1.1 https://www.javbus.com/forum/member.php
- String memberUrl = "https://www.javbus.com/forum/member.php";
- Map<String, String> params = new HashMap<>();
- params.put("mod", "logging");
- params.put("action", "login");
- params.put("referer", "");
- params.put("infloat", "yes");
- params.put("handlekey", "login");
- params.put("inajax", "1");
- params.put("ajaxtarget", "fwin_content_login");
- String memberHtmlStr = JsoupUtil.requestDocument(memberUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, null, params).html().replace("<![CDATA[", "").replace("]]>", "");
- Document memberDocument = Jsoup.parse(memberHtmlStr);
- String key1 = memberDocument.select("input[type='password']").first().attr("id").split("_")[1];
- String key2 = memberDocument.select("span[id^='seccode']").first().attr("id").split("_")[1];
- String key3 = memberDocument.select("input[name='formhash']").first().val();
- // 1.2 https://www.javbus.com/forum/misc.php
- String miscUrl = "https://www.javbus.com/forum/misc.php";
- params.clear();
- params.put("mod", "seccode");
- params.put("action", "update");
- params.put("idhash", key2);
- params.put("modid", "member::logging");
- Document miscDocument = JsoupUtil.requestDocument(miscUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, null, params);
- String imgVerifyUrl = "https://www.javbus.com/forum/" + miscDocument.select("img[onclick]").first().attr("src");
- System.out.println("imgVerifyUrl=" + imgVerifyUrl);
- // 1.3 get imgVerifyUrl
- Map<String, String> headerParams = new HashMap<>();
- headerParams.put("referer", "https://www.javbus.com/forum/forum.php");
- Connection.Response imgResponse = JsoupUtil.requestBody(imgVerifyUrl, JsoupUtil.HTTP_GET, proxy, forumCookies, headerParams, null);
- byte[] imgBytes = imgResponse.bodyAsBytes();
- Map<String, String> imgCookies = imgResponse.cookies();
- System.out.println("imgCookies=" + imgCookies);
- String cookieKey4Seccode = "";
- for (Map.Entry<String, String> imgCookie : imgCookies.entrySet()) {
- if (imgCookie.getKey().contains("seccode")) {
- cookieKey4Seccode = imgCookie.getKey();
- break;
- }
- }
- // 1.4 get imgVerifyNumber by BaiduOCR
- {
- //一次最多读取1k
- byte[] buffer = new byte[1024];
- //实际读取的长度
- int readLenghth;
- //创建的一个写出的缓冲流
- File savePathPrexFile = new File("d:\\zhiqiang.lv\\Desktop");
- BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(new File(savePathPrexFile, "1.png")));
- //文件逐步写入本地
- BufferedInputStream bufferedInputStream = new BufferedInputStream(new ByteArrayInputStream(imgBytes));
- while ((readLenghth = bufferedInputStream.read(buffer, 0, 1024)) != -1) {//先读出来,保存在buffer数组中
- bufferedOutputStream.write(buffer, 0, readLenghth);//再从buffer中取出来保存到本地
- }
- //关闭缓冲流
- bufferedOutputStream.close();
- bufferedInputStream.close();
- }
- String ocrAccurateBasicUrl = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic";
- String accessToken = getAuth("taQYPoO9deuODxEsltkGFyqG", "lm9laVGOO14cH3sfWvtwcL1GtEC8rwS9");
- headerParams.clear();
- headerParams.put("Content-Type", "application/x-www-form-urlencoded");
- params.clear();
- params.put("image", Base64.getEncoder().encodeToString(imgBytes));
- Connection.Response ocrResponse = JsoupUtil.requestBody(ocrAccurateBasicUrl.concat("?access_token=").concat(accessToken),
- JsoupUtil.HTTP_POST, Proxy.NO_PROXY, headerParams, params);
- JSONObject crAccurateBasicResult = JSONObject.parseObject(ocrResponse.body());
- String seccodeverify = crAccurateBasicResult.getJSONArray("words_result").getJSONObject(0).getString("words");
- // 1.5 https://www.javbus.com/forum/member.php
- StringBuffer sbParams = new StringBuffer();
- sbParams.append("?mod=logging&action=login&loginsubmit=yes&handlekey=login&loginhash=").append(key1).append("&inajax=1");
- headerParams.clear();
- headerParams.put("Content-Type", "application/x-www-form-urlencoded");
- params.clear();
- params.put("formhash", key3);
- params.put("referer", "https://www.javbus.com/forum/forum.php");
- params.put("loginfield", "username");
- params.put("username", "Tujide.lv");
- params.put("password", "Lzq920165830.");
- params.put("questionid", "0");
- params.put("answer", "");
- params.put("seccodehash", key2);
- params.put("seccodemodid", "member::logging");
- params.put("seccodeverify", seccodeverify);
- System.out.println(sbParams);
- System.out.println(params);
- if (cookieKey4Seccode != "") {
- forumCookies.put("existmag", "mag");
- forumCookies.put(cookieKey4Seccode, imgCookies.get(cookieKey4Seccode));
- System.out.println("forumCookies2=" + forumCookies);
- }
- Connection.Response loginResponse = JsoupUtil.requestBody(memberUrl.concat(sbParams.toString()), JsoupUtil.HTTP_POST, proxy, forumCookies, headerParams, params);
- Map<String, String> loginCookies = loginResponse.cookies();
- System.out.println("loginCookies=" + loginCookies);
- System.out.println(loginResponse.body());
- for (Map.Entry<String, String> loginCookie : loginCookies.entrySet()) {
- if (loginCookie.getKey().contains("ulastactivity")) {
- forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
- } else if (loginCookie.getKey().contains("auth")) {
- forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
- } else if (loginCookie.getKey().contains("lastcheckfeed")) {
- forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
- } else if (loginCookie.getKey().contains("lip")) {
- forumCookies.put(loginCookie.getKey(), loginCookie.getValue());
- }
- }
- System.out.println("loginCookies2=" + forumCookies);
- // 2 获取个人资料
- Connection.Response memberInfoResponse = JsoupUtil.requestBody("https://www.javbus.com/forum/?355292", JsoupUtil.HTTP_GET, proxy, forumCookies, null, null);
- System.out.println("memberInfoCookies=" + memberInfoResponse.cookies());
- System.out.println(memberInfoResponse.body());
- }
- public static String getAuth(String ak, String sk) {
- // 获取token地址
- String authHost = "https://aip.baidubce.com/oauth/2.0/token?";
- String getAccessTokenUrl = authHost
- // 1. grant_type为固定参数
- + "grant_type=client_credentials"
- // 2. 官网获取的 API Key
- + "&client_id=" + ak
- // 3. 官网获取的 Secret Key
- + "&client_secret=" + sk;
- try {
- URL realUrl = new URL(getAccessTokenUrl);
- // 打开和URL之间的连接
- HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection();
- connection.setRequestMethod("GET");
- connection.connect();
- // 获取所有响应头字段
- Map<String, List<String>> map = connection.getHeaderFields();
- // 遍历所有的响应头字段
- for (String key : map.keySet()) {
- System.err.println(key + "--->" + map.get(key));
- }
- // 定义 BufferedReader输入流来读取URL的响应
- BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
- String result = "";
- String line;
- while ((line = in.readLine()) != null) {
- result += line;
- }
- /**
- * 返回结果示例
- */
- System.err.println("result:" + result);
- JSONObject jsonObject = JSONObject.parseObject(result);
- String access_token = jsonObject.getString("access_token");
- return access_token;
- } catch (Exception e) {
- System.err.print("获取token失败!");
- e.printStackTrace(System.err);
- }
- return null;
- }
- }
|