Sfoglia il codice sorgente

add:fb爬取未成功待解决v1

lvzhiqiang 3 anni fa
parent
commit
3d6d4689eb

+ 20 - 0
src/main/java/top/lvzhiqiang/controller/CrawlerController.java

@@ -5,6 +5,7 @@ import org.springframework.web.bind.annotation.RequestMapping;
 import org.springframework.web.bind.annotation.ResponseBody;
 import top.lvzhiqiang.dto.R;
 import top.lvzhiqiang.exception.ParameterException;
+import top.lvzhiqiang.service.Crawler4FacebookService;
 import top.lvzhiqiang.service.CrawlerService;
 import top.lvzhiqiang.util.StringUtils;
 
@@ -22,6 +23,8 @@ public class CrawlerController {
 
     @Resource
     private CrawlerService crawlerService;
+    @Resource
+    private Crawler4FacebookService crawler4FacebookService;
 
     /**
      * findXiaoeknowCourse
@@ -153,4 +156,21 @@ public class CrawlerController {
 
         return crawlerService.findIkoaMovie4VideoInfo(bitrate, order);
     }
+
+    /**
+     * jsoupFacebookGroupMemberInfo
+     *
+     * @author lvzhiqiang
+     * 2022/10/11 16:11
+     */
+    @RequestMapping("/jsoupFacebookGroupMemberInfo")
+    @ResponseBody
+    public String jsoupFacebookGroupMemberInfo(String email, String password, String url, String limit) throws Exception {
+        if (StringUtils.isEmpty(email) || StringUtils.isEmpty(password) || StringUtils.isEmpty(url)) {
+            throw new ParameterException("email|password|url不能为空");
+        }
+
+        crawler4FacebookService.jsoupFacebookGroupMemberInfo(email, password, url, limit);
+        return "success";
+    }
 }

+ 12 - 0
src/main/java/top/lvzhiqiang/service/Crawler4FacebookService.java

@@ -0,0 +1,12 @@
+package top.lvzhiqiang.service;
+
+/**
+ * Crawler Facebook Service
+ *
+ * @author lvzhiqiang
+ * 2022/10/11 16:11
+ */
+public interface Crawler4FacebookService {
+
+    /**
+     * Crawls member information of a Facebook group, going by the method name; the interface
+     * itself does not spell out the contract any further.
+     *
+     * @param email    Facebook account e-mail used for the login
+     * @param password Facebook account password used for the login
+     * @param url      presumably the group id or group URL to crawl — TODO confirm against the implementation
+     * @param limit    presumably a cap on the number of members, passed as text — TODO confirm against the implementation
+     * @throws Exception on any failure; the signature does not narrow the exception type
+     */
+    void jsoupFacebookGroupMemberInfo(String email, String password, String url, String limit) throws Exception;
+}

+ 165 - 0
src/main/java/top/lvzhiqiang/service/impl/Crawler4FacebookServiceImpl.java

@@ -0,0 +1,165 @@
+package top.lvzhiqiang.service.impl;
+
+import lombok.extern.slf4j.Slf4j;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import top.lvzhiqiang.mapper.DicCodeMapper;
+import top.lvzhiqiang.service.Crawler4FacebookService;
+
+import javax.annotation.Resource;
+import javax.net.ssl.*;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.security.SecureRandom;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * Crawler Facebook ServiceImpl
+ *
+ * @author lvzhiqiang
+ * 2022/10/11 16:11
+ */
+@Service
+@Slf4j
+public class Crawler4FacebookServiceImpl implements Crawler4FacebookService {
+
+    // Injected mapper; no method visible in this class references it.
+    @Resource
+    private DicCodeMapper dicCodeMapper;
+
+    // Name of the active Spring profile; the login flow compares it with "dev" to choose a proxy.
+    @Value("${spring.profiles.active}")
+    private String env;
+
+    // Handed to Jsoup's Connection.timeout(int), which is specified in milliseconds.
+    protected final static int TIMEOUT_CONNECTION = 60000;
+    // HTTP method names; the request helpers compare against HTTP_POST case-insensitively.
+    protected final static String HTTP_GET = "GET";
+    protected final static String HTTP_POST = "POST";
+    // Cookies captured by requestBody() and replayed on later requests.
+    // Static and mutable, so the value is shared by every caller of this class.
+    protected static Map<String, String> cookies = null;
+    // NOTE(review): not referenced in the visible code (getConnection() calls getUserAgent() instead);
+    // the literal also starts with a space — confirm whether this field is still needed.
+    protected static String userAgent = " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11";
+
+    /**
+     * Attempts a Facebook login with the given credentials.
+     * <p>
+     * Flow: the login URL is first fetched with GET so that {@code requestBody} can capture the
+     * cookies the server hands out, then the credentials are POSTed to the same URL. The
+     * {@code c_user} cookie of the login response is treated as the id of the logged-in user;
+     * when it is missing the login is reported as failed in the log.
+     * <p>
+     * NOTE: only the login step exists so far — {@code url} and {@code limit} are accepted but
+     * not used by this method yet.
+     *
+     * @param email    Facebook account e-mail, sent as form field {@code email}
+     * @param password Facebook account password, sent as form field {@code pass}; never logged
+     * @param url      currently unused
+     * @param limit    currently unused
+     * @throws Exception if either HTTP request fails
+     */
+    @Override
+    public void jsoupFacebookGroupMemberInfo(String email, String password, String url, String limit) throws Exception {
+        // Login endpoint in effect. Earlier candidates, kept for reference only (each of them
+        // used to be assigned and immediately overwritten before any request was made):
+        //   https://www.facebook.com/login/device-based/regular/login/?login_attempt=1
+        //   https://www.facebook.com/login.php?login_attempt=1
+        //   https://www.facebook.com/login?privacy_mutation_token=eyJ0eXBlIjowLCJjcmVhdGlvbl90aW1lIjoxNjY1NTQwMzA5LCJjYWxsc2l0ZV9pZCI6MzgxMjI5MDc5NTc1OTQ2fQ%3D%3D
+        //   https://www.facebook.com/login/device-based/regular/login/?login_attempt=1&lwv=101
+        String loginUrl = "https://m.facebook.com/login/async/?refsrc=https%3A%2F%2Fm.facebook.com%2F&lwv=100";
+
+        // Proxy and token setup: the "dev" profile goes through a local HTTP proxy, others connect directly.
+        Proxy proxy;
+        if ("dev".equals(env)) {
+            proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 1080));
+        } else {
+            proxy = Proxy.NO_PROXY;
+        }
+
+        // Simulated login, step 1: plain GET so the cookies issued by the server get stored.
+        requestBody(loginUrl, HTTP_GET, proxy, null);
+
+        // trustEveryone() is left disabled here; calling it would switch off TLS certificate and
+        // hostname verification for every HttpsURLConnection in the JVM.
+
+        // Simulated login, step 2: POST the credentials together with the stored cookies.
+        Map<String, String> loginParams = new HashMap<>();
+        loginParams.put("email", email);
+        loginParams.put("pass", password);
+        Connection.Response loginResponse = requestBody(loginUrl, HTTP_POST, proxy, loginParams);
+
+        // c_user carries the id of the logged-in user; without it the login did not go through.
+        String userId = loginResponse.cookies().get("c_user");
+        if (userId == null || userId.isEmpty()) {
+            log.warn("Facebook login returned no c_user cookie, httpStatus={}", loginResponse.statusCode());
+        } else {
+            log.info("Facebook login succeeded, userId={}", userId);
+        }
+    }
+
+    /**
+     * Creates a Jsoup connection for {@code url} preconfigured with the shared timeout, the given
+     * proxy, a randomly picked User-Agent, redirect following and content-type checks disabled.
+     *
+     * @param url   target address
+     * @param proxy proxy to route the request through
+     * @return the configured, not yet executed connection
+     */
+    private Connection getConnection(String url, Proxy proxy) {
+        Connection connection = Jsoup.connect(url);
+        connection.timeout(TIMEOUT_CONNECTION);
+        connection.proxy(proxy);
+        connection.userAgent(getUserAgent());
+        connection.followRedirects(true);
+        connection.ignoreContentType(true);
+        return connection;
+    }
+
+    /**
+     * Executes a request and returns the parsed response document.
+     * <p>
+     * Sends the stored cookies when there are any, but does not store cookies from the response.
+     *
+     * @param url        target address
+     * @param httpMethod {@code "POST"} (any case) issues a POST; every other value issues a GET
+     * @param proxy      proxy to route the request through
+     * @param data       request parameters; {@code null} or empty means none
+     * @return the parsed document
+     * @throws Exception if the request fails
+     */
+    protected Document requestDocument(String url, String httpMethod, Proxy proxy, Map<String, String> data) throws Exception {
+        Connection conn = getConnection(url, proxy);
+
+        boolean hasData = data != null && !data.isEmpty();
+        if (hasData) {
+            conn.data(data);
+        }
+        if (cookies != null) {
+            conn.cookies(cookies);
+        }
+
+        if (HTTP_POST.equalsIgnoreCase(httpMethod)) {
+            return conn.post();
+        }
+        return conn.get();
+    }
+
+    /**
+     * Executes a request and returns the raw response, keeping the shared cookie jar up to date.
+     * <p>
+     * Cookies set by the response are merged into the stored cookies instead of replacing them,
+     * so cookies captured by an earlier request (for example the pre-login GET) are still sent
+     * after a later response sets additional ones. A cookie with the same name is overwritten by
+     * the newer value.
+     * <p>
+     * NOTE(review): the cookie jar is a static field without synchronization; concurrent callers
+     * share and may overwrite each other's session cookies.
+     *
+     * @param url        target address
+     * @param httpMethod {@code "POST"} (any case) issues a POST; every other value issues a GET
+     * @param proxy      proxy to route the request through
+     * @param data       request parameters; {@code null} or empty means none
+     * @return the executed response
+     * @throws Exception if the request fails
+     */
+    protected Connection.Response requestBody(String url, String httpMethod, Proxy proxy, Map<String, String> data) throws Exception {
+        Connection connection = getConnection(url, proxy);
+        if (data != null && !data.isEmpty()) {
+            connection.data(data);
+        }
+        if (cookies != null) {
+            connection.cookies(cookies);
+        }
+        connection.method(HTTP_POST.equalsIgnoreCase(httpMethod) ? Connection.Method.POST : Connection.Method.GET);
+        Connection.Response res = connection.execute();
+
+        Map<String, String> responseCookies = res.cookies();
+        if (responseCookies != null && !responseCookies.isEmpty()) {
+            // Build a fresh map rather than mutating the stored one or keeping a reference to
+            // the response's own map.
+            Map<String, String> merged = new HashMap<>();
+            if (cookies != null) {
+                merged.putAll(cookies);
+            }
+            merged.putAll(responseCookies);
+            cookies = merged;
+        }
+        return res;
+    }
+
+    /**
+     * Picks one User-Agent header value at random from a fixed pool of browser strings.
+     *
+     * @return a User-Agent string from the pool
+     */
+    private String getUserAgent() {
+        String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
+                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
+                "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
+                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
+                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
+                // This entry was truncated with a literal ellipsis, which is not a valid header
+                // value; restored to a complete Firefox 60 string in the same shape as the first entry.
+                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0",
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"};
+        // Bound by the pool size instead of a hard-coded 15, which left the last entry unreachable
+        // and would silently go stale whenever the pool changes.
+        Random r = new Random();
+        return ua[r.nextInt(ua.length)];
+    }
+
+    /**
+     * Trust any site: installs a hostname verifier that accepts every host and an SSL socket
+     * factory whose trust manager accepts every certificate chain.
+     * <p>
+     * WARNING: both are installed as JVM-wide defaults on {@link HttpsURLConnection}, so after
+     * this call HTTPS connections made through it are no longer protected against
+     * man-in-the-middle attacks. Use for local debugging only.
+     * <p>
+     * Best effort: if the SSL context cannot be created, the failure is logged and no default is
+     * changed.
+     */
+    public void trustEveryone() {
+        try {
+            // Build everything first so a failure cannot leave only one of the two defaults replaced.
+            SSLContext context = SSLContext.getInstance("TLS");
+            context.init(null, new X509TrustManager[]{new X509TrustManager() {
+                @Override
+                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
+                    // accept everything
+                }
+
+                @Override
+                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
+                    // accept everything
+                }
+
+                @Override
+                public X509Certificate[] getAcceptedIssuers() {
+                    return new X509Certificate[0];
+                }
+            }}, new SecureRandom());
+
+            HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
+                @Override
+                public boolean verify(String hostname, SSLSession session) {
+                    return true;
+                }
+            });
+            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
+        } catch (Exception e) {
+            // Log through the class logger instead of dumping the stack trace to stderr.
+            log.error("Failed to install trust-all SSL defaults", e);
+        }
+    }
+}
+
+

+ 15 - 0
src/main/resources/static/crawler.html

@@ -158,6 +158,21 @@
             <input type="submit" value="提交">
         </form>
     </div>
+    <!-- Facebook group-member crawl form: POSTs email, password, url and limit to
+         bg/crawler/jsoupFacebookGroupMemberInfo. The placeholders describe url as one or more
+         group ids/URLs separated by semicolons and limit as a member cap (empty or <= 0 meaning
+         unlimited); NOTE(review): confirm the backend actually honours both. -->
+    <hr/>
+    <div style="margin-right:20px;">
+        <span class="font">jsoupFacebookGroupMemberInfo</span>
+        <form method="post" action="bg/crawler/jsoupFacebookGroupMemberInfo">
+            <span>Email</span>
+            <input type="text" name="email" placeholder="Facebook帐号" style="width: 100px;"/>
+            <span>Password</span>
+            <input type="password" name="password" placeholder="Facebook密码" style="width: 100px;"/>
+            <span>Group Id / URL</span>
+            <input type="text" name="url" placeholder="群组的唯一标示或者主页地址,多个群组用英文分号隔开" style="width: 350px;" />
+            <span>Limit</span>
+            <input type="text" name="limit" placeholder="限制爬取群组中的成员数,不填或者小于等于0代表无限制" style="width: 350px;"/>
+            <input type="submit" value="提交">
+        </form>
+    </div>
 </div>
 </body>
 </html>