A Technical Walkthrough of Automated Dorking with Google Dorks and SSL Bypassing

This article walks through a Python script that automates Google Dork searches, covering proxy scraping and filtering, user-agent rotation, SSL bypassing, and multithreading to speed up searches.

Google Dork SSL Bypassing For Auto Dorking

Overview

This article presents a Python script that automates Google Dork searches, using proxy and user-agent rotation to bypass SSL restrictions and improve both search throughput and anonymity.

Code Walkthrough

Importing the Required Libraries

import os
import time
import random
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import requests
from termcolor import colored
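requests, bs4, and termcolor are third-party packages; assuming a standard pip environment, they can be installed with:

pip install requests beautifulsoup4 termcolor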

Fetching the Proxy List

def get_proxies():
    # Download a fresh proxy list on the first run; reuse the cached copy afterwards.
    if not os.path.exists("proxies.txt"):
        url = "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all&limit=5000"
        proxies = requests.get(url).text.split("\n")
        with open("proxies.txt", "w") as f:
            f.write("\n".join(proxies))
    else:
        with open("proxies.txt", "r") as f:
            proxies = f.read().split("\n")
    # Drop empty entries left over from trailing newlines.
    return [p.strip() for p in proxies if p.strip()]
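Assuming the ProxyScrape endpoint returns one ip:port pair per line, which is what the caching logic above expects, proxies.txt ends up looking like this (the addresses below are purely illustrative):

103.152.112.162:80
45.77.56.114:3128
190.61.88.147:8080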

Testing Proxy Availability

def test_proxy(proxy, user_agent, verbose):
    # Probe the proxy against a lightweight target; any 200 response counts as working.
    test_url = "https://bing.com"
    headers = {"User-Agent": user_agent}
    try:
        proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
        response = requests.get(test_url, headers=headers, proxies=proxies, timeout=3)
        if response.status_code == 200:
            print(colored(f"Good proxy found: {proxy}", "green"))
            return True
    except requests.exceptions.ConnectTimeout:
        if verbose:
            print(colored(f"Connection timeout for proxy: {proxy}", "red"))
    except requests.exceptions.ProxyError:
        if verbose:
            print(colored(f"Proxy error for proxy: {proxy}", "red"))
    except requests.exceptions.RequestException as e:
        if verbose:
            print(colored(f"Request exception for proxy: {proxy}, error: {e}", "red"))
    return False

Filtering Working Proxies

def filter_working_proxies(proxies, user_agents, verbose):
    # Test all proxies concurrently and keep only those that respond.
    working_proxies = []
    user_agent = random.choice(user_agents)  # a single UA is enough for the probe
    print(colored("Scraping good proxies...", "blue"))
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures_to_proxies = {executor.submit(test_proxy, proxy, user_agent, verbose): proxy for proxy in proxies}
        for future in as_completed(futures_to_proxies):
            if future.result():
                working_proxies.append(futures_to_proxies[future])
    return working_proxies

Loading the User-Agent List

def get_user_agents():
    # Read one User-Agent string per line, skipping blank lines.
    with open("useragents.txt", "r") as f:
        return [ua.strip() for ua in f if ua.strip()]

Performing the Google Search

def google_search(query, user_agent, proxy):
    # Let requests URL-encode the query via params instead of splicing it into the URL by hand.
    url = "https://www.google.com/search"
    headers = {"User-Agent": user_agent}
    proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
    response = requests.get(url, params={"q": query}, headers=headers, proxies=proxies, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    # .yuRUbf is the container Google currently wraps organic result links in.
    return [result["href"] for result in soup.select(".yuRUbf a")]
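Note that .yuRUbf is an internal Google class name and can change without notice. As a hedge, a fallback parser keyed off the <h3> result titles could look like the sketch below; extract_links is a hypothetical helper, not part of the original script:

from bs4 import BeautifulSoup  # already imported at the top of the script

def extract_links(html):
    # Fallback sketch: organic results are typically anchors wrapping an <h3> title.
    # Google's markup changes frequently, so this selector may also need updating.
    soup = BeautifulSoup(html, "html.parser")
    return [a["href"] for a in soup.select("a:has(h3)")
            if a.get("href", "").startswith("http")]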

Searching a Dork

def search_dork(dork, proxies, user_agents, verbose, max_retries=3, backoff_factor=1.0):
    print(colored(f"Searching for dork: {dork}", "yellow"))

    def try_search_dork(dork, proxy, user_agent):
        try:
            return google_search(dork, user_agent, proxy)
        except requests.exceptions.RequestException as e:
            if verbose:
                print(colored(f"Error with proxy {proxy}: {e}, rotating proxy...", "magenta"))
            return None

    retries = 0
    while retries <= max_retries:
        # Rotate to a fresh proxy/User-Agent pair on every attempt.
        proxy = random.choice(proxies)
        user_agent = random.choice(user_agents)
        results = try_search_dork(dork, proxy, user_agent)

        if results is not None:
            if results:
                # Dorks often contain characters that are illegal in filenames (e.g. ':', '/').
                safe_name = "".join(c if c.isalnum() else "_" for c in dork)
                with open(f"results/{safe_name}_results.txt", "w") as f:
                    f.write("\n".join(results[:20]))
                print(colored(f"Saved top 20 results for dork '{dork}'", "green"))
            else:
                print(colored(f"No results found for dork '{dork}'", "red"))
            break

        retries += 1
        # Exponential backoff with random jitter before the next attempt.
        time.sleep(backoff_factor * (2 ** (retries - 1)) + random.uniform(1, 5))

Main Function

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Display errors with proxies.", action="store_true")
    args = parser.parse_args()

    # Load the dorks to search, skipping blank lines.
    with open("dorks.txt", "r") as f:
        dorks = [line.strip() for line in f if line.strip()]

    user_agents = get_user_agents()
    proxies = filter_working_proxies(get_proxies(), user_agents, args.verbose)
    if not proxies:
        print(colored("No working proxies found, exiting.", "red"))
        return

    if not os.path.exists("results"):
        os.makedirs("results")

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(search_dork, dork, proxies, user_agents, args.verbose): dork for dork in dorks}
        for future in as_completed(futures):
            future.result()

if __name__ == "__main__":
    main()

Key Techniques

  1. Proxy management: fetch a proxy list from an external API and test each proxy for availability.
  2. User-agent rotation: pick a random User-Agent string per request to reduce the chance of detection.
  3. Multithreading: use ThreadPoolExecutor to parallelize both proxy testing and dork searches.
  4. Error handling: retry failed requests with exponential backoff and jitter; see the sketch after this list.
  5. Result saving: write search results to local files for later analysis.
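For point 4, the backoff in search_dork doubles the base delay on every retry and adds 1-5 seconds of random jitter. A quick sketch of the resulting sleep schedule, assuming the default backoff_factor=1.0:

import random

backoff_factor = 1.0
for retries in range(1, 4):  # matches search_dork's default max_retries=3
    delay = backoff_factor * (2 ** (retries - 1)) + random.uniform(1, 5)
    print(f"retry {retries}: sleeping ~{delay:.1f}s")  # base delays of 1s, 2s, 4s plus jitter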

Usage

  1. Prepare a dorks.txt file containing the Google Dorks to search, one per line.
  2. Prepare a useragents.txt file containing User-Agent strings, one per line.
  3. Run the script; the optional -v flag prints detailed proxy error messages. A sample run is sketched below.
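A minimal run might look like the following. The file contents are illustrative, and the script name dorker.py is a placeholder for whatever you saved the script as:

# dorks.txt -- one dork per line
inurl:admin intitle:"login"
filetype:env "DB_PASSWORD"

# useragents.txt -- one User-Agent string per line
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36

$ python dorker.py -v

Results for each dork are written to the results/ directory, one file per dork.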

By automating proxy management and rotating user agents, the script works around Google's access restrictions and runs dork searches efficiently.
