以下 Python 代码基于 Python 3.13 编写,后续如升级到其他版本可自行调整。
注意事项:需要在运行代码时的当前目录(通常即代码同级目录)下新建一个 url.txt 文件,将需要扫描的地址填入,每个地址单独一行。扫描完成后会在同一目录下生成一个以当前日期和时间命名的 Excel 文件(.xls)。
import logging
import os
import re
import socket
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlsplit

import requests
import urllib3
import xlrd
import xlwt
import yagmail
from urllib3.exceptions import InsecureRequestWarning
from xlutils.copy import copy

# Requests below are sent with verify=False; silence the resulting warnings.
urllib3.disable_warnings(InsecureRequestWarning)

# Request headers used when fetching status code and title.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.85 Safari/537.36',
}

# Patterns are compiled once because every worker thread reuses them.
# NOTE: the two <meta> patterns only match tags terminated by `" />` or `"/>`
# with exactly the attribute order name=... content=...
_TITLE_RE = re.compile(r"(?<=\<title\>)(?:.|\n)+?(?=\<)", re.IGNORECASE)
_DESCRIPTION_RE = re.compile(
    r"(?<=\<meta name=\"description\" content=\")(?:.|\n)+?(?=\" \/\>|\"\/\>)",
    re.IGNORECASE,
)
_KEYWORDS_RE = re.compile(
    r"(?<=\<meta name=\"keywords\" content=\")(?:.|\n)+?(?=\" \/\>|\"\/\>)",
    re.IGNORECASE,
)


def _first_match(pattern, text, default=" "):
    """Return the first match of *pattern* in *text*, stripped, or *default*."""
    found = pattern.search(text)
    return found.group(0).strip() if found else default


def get_ip(url):
    """Resolve the host part of *url* and return the first address found.

    Args:
        url: A URL or bare host name. Scheme, port, path and query are
            ignored; only the host name is resolved.

    Returns:
        The address string of the first ``getaddrinfo`` result.

    Raises:
        socket.gaierror: If no host name can be extracted or resolution fails.
        ValueError: If the URL is malformed (raised by ``urlsplit``).
    """
    target = url.strip()
    if '://' not in target:
        # Without a scheme urlsplit() would treat the host as a path.
        target = '//' + target
    host = urlsplit(target).hostname
    if not host:
        raise socket.gaierror(socket.EAI_NONAME, 'no host name in %r' % url)
    myaddr = socket.getaddrinfo(host, 'http')
    return myaddr[0][4][0]


def get_codetitle(url):
    """Fetch *url* and collect its status code, title, meta tags and IP.

    A page that lacks a title, description or keywords tag keeps the
    single-space placeholder for that field instead of aborting the lookup.

    Args:
        url: Absolute URL to request.

    Returns:
        A tuple ``(resurl, code, title, description, keywords, ip)``.
        ``code`` is the integer HTTP status, or the string "无法访问" when
        the request failed; ``ip`` is ``'null'`` when resolution failed.
    """
    code = "无法访问"
    title = " "
    resurl = " "
    description = " "
    keywords = " "
    try:
        res = requests.get(url, headers=header, verify=False,
                           allow_redirects=True, timeout=(3, 12))
    except requests.RequestException as error:
        logging.error('%s网址无效或者IP被封锁: %s', url, error)
    else:
        res.encoding = res.apparent_encoding
        code = res.status_code
        resurl = res.url
        html = res.text
        title = _first_match(_TITLE_RE, html)
        description = _first_match(_DESCRIPTION_RE, html)
        keywords = _first_match(_KEYWORDS_RE, html)

    try:
        ip = get_ip(url)
    except (socket.error, ValueError) as error:
        # ValueError also covers UnicodeError raised for invalid host labels.
        logging.error('获取IP失败: %s', error)
        ip = 'null'
    return resurl, code, title, description, keywords, ip


def write(url):
    """Scan *url* and append one result row to the output workbook.

    Relies on the module-level ``lock``, ``path`` and ``savefilename`` that
    the ``__main__`` section sets up. The workbook is re-read and re-saved
    for every row, so the whole read-modify-write cycle runs under ``lock``.

    Args:
        url: Absolute URL to scan.
    """
    codetitle = get_codetitle(url)
    resurl, code, title, description, keywords, ip = map(str, codetitle)
    logging.info('%s | %s | %s | %s', url, code, title, ip)
    xls_file = os.path.join(path, savefilename + '.xls')
    with lock:
        with xlrd.open_workbook(xls_file) as word_book:
            sheets = word_book.sheet_names()
            work_sheet = word_book.sheet_by_name(sheets[0])
            # The next free row index equals the current row count.
            row = work_sheet.nrows
            new_work_book = copy(word_book)
            new_sheet = new_work_book.get_sheet(0)
            values = (url, resurl, code, title, description, keywords, ip)
            for column, value in enumerate(values):
                new_sheet.write(row, column, value)
            new_work_book.save(xls_file)


def process_urls(input_file, output_file):
    """Normalize the URL list so every entry carries an explicit scheme.

    Reads *input_file* line by line, skips blank lines, prefixes entries that
    lack ``http://`` / ``https://`` with ``http://`` and writes the result to
    *output_file*, one URL per line.

    Args:
        input_file: Path of the raw URL list (UTF-8, an optional BOM is
            tolerated).
        output_file: Path of the normalized list, written as UTF-8 so it
            matches how the ``__main__`` section reads it back.
    """
    with open(input_file, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
    with open(output_file, "w", encoding="utf-8") as f2:
        for line in lines:
            line = line.strip()
            if not line:
                # A blank line would otherwise become the bogus URL "http://".
                continue
            if not line.startswith(('http://', 'https://')):
                line = 'http://' + line
            f2.write(line + '\n')


def send_email(duration):
    """Send the completion notice by e-mail; failures are only logged.

    Credentials come from the EMAIL_USER / EMAIL_PASS environment variables.

    Args:
        duration: Elapsed scan time in seconds, embedded in the message body.
    """
    try:
        yag = yagmail.SMTP(user=os.getenv("EMAIL_USER"),
                           password=os.getenv("EMAIL_PASS"),
                           host='smtp.qq.com', port=465)
        try:
            contents = [f'TDK获取时间:{duration}秒']
            subject = 'TDK获取完成通知'
            receiver = ["705276383@qq.com"]
            yag.send(to=receiver, subject=subject, contents=contents)
        finally:
            # Close the connection even when send() raises.
            yag.close()
    except Exception as error:
        # Best-effort notification: never let a mail problem fail the scan.
        logging.error('发送邮件失败: %s', error)


if __name__ == "__main__":
    path = os.getcwd()
    logging.captureWarnings(True)
    logging.basicConfig(level=logging.INFO)
    # Kept in case requests exposes its own urllib3 copy (older releases).
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    start = time.time()
    lock = threading.Lock()
    savefilename = time.strftime("%Y-%m-%d %H.%M.%S")

    # Create the workbook with its header row; write() appends below it.
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'title', cell_overwrite_ok=True)
    column_titles = ("源地址", "跳转地址", "状态码", "标题", "描述", "关键词", "IP")
    for column, column_title in enumerate(column_titles):
        sheet1.write(0, column, column_title)
    myxls.save(os.path.join(path, savefilename + '.xls'))

    process_urls(os.path.join(path, "url.txt"), os.path.join(path, "url-run.txt"))
    with open(os.path.join(path, 'url-run.txt'), 'r', encoding='utf-8') as f:
        urls_data = [data.strip().strip('\\') for data in f]

    with ThreadPoolExecutor(max_workers=100) as executor:
        futures = {executor.submit(write, url=url): url for url in urls_data}
        for future in as_completed(futures):
            # Surface worker failures instead of letting the future hide them.
            error = future.exception()
            if error is not None:
                logging.error('处理 %s 时出错: %s', futures[future], error)

    end = time.time()
    logging.info("总耗时: %s 秒", end - start)
    send_email(end - start)