# Multi-threaded proxy IP rotation example (多线程代理ip切换实例)
# -*- coding:utf-8 -*-
"""
多线程代理ip切换实例
"""
from threading import Thread
from queue import Queue
from random import choice
import requests
import re
import time
class Spider(Thread):
    """Worker thread: pulls URLs off the shared queue, fetches each one
    through a proxy from the shared pool, and rotates the proxy whenever
    a request fails or Baidu answers with its captcha page."""

    def __init__(self, queue: Queue, config: dict):
        """
        :param queue: shared queue of URLs to crawl
        :param config: shared dict holding 'proxies' (list of proxy dicts
                       consumed/pruned by the workers) and 'encoding'
                       (encoding applied to response bodies)
        """
        super(Spider, self).__init__()
        self.queue = queue
        self.config = config
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
        }

    def run(self) -> None:
        """Consume URLs forever.

        task_done() is guaranteed by the finally clause so the main
        thread's queue.join() can complete even on a failed fetch.
        """
        while True:
            url = self.queue.get()
            try:
                # Always take the first proxy in the pool so all workers
                # share one IP until it is removed as dead.
                proxy = self.config['proxies'][0] if self.config['proxies'] else None
                print(proxy)
                # Alternative random strategy:
                # proxy = choice(self.config['proxies']) if self.config['proxies'] else None
                source = self.download(url, proxy)
                if source is None:
                    # Download failed: drop this proxy, requeue the URL,
                    # and do NOT fall through to title extraction
                    # (the original parsed None and printed a bogus title).
                    print('获取源码失败')
                    self.change_proxy(proxy, url)
                    continue
                title = self.get_title(source)
                if title == "百度安全验证":
                    # Baidu served a captcha page: this proxy is burned.
                    self.change_proxy(proxy, url)
                print(title)
            finally:
                self.queue.task_done()

    def change_proxy(self, proxy, url):
        """Remove a dead proxy from the shared pool and requeue its URL.

        :param proxy: proxy dict that just failed (may be None)
        :param url: URL whose request failed; put back for a retry

        Note: the original version additionally popped the *next* proxy
        into a discarded local, silently throwing away a good proxy on
        every failure — that line is removed here.
        """
        if proxy in self.config['proxies']:
            self.config['proxies'].remove(proxy)
        self.queue.put(url)  # retry the failed URL with the next proxy

    @staticmethod
    def get_title(source):
        """Return the concatenation of all <title> contents in *source*,
        or None when *source* is not a string (e.g. a failed download)."""
        if not isinstance(source, str):
            return None
        return ''.join(re.findall(r'<title>(.*?)</title>', source, flags=re.I))

    def download(self, url, proxy=None):
        """GET *url* through *proxy*; return the decoded body text, or
        None on any request error (timeout, connection failure, ...)."""
        try:
            r = requests.get(url, proxies=proxy, headers=self.headers, timeout=5)
        except requests.RequestException:
            return None
        r.encoding = self.config['encoding']
        return r.text
def get_proxy(config: dict):
    """Producer loop: keep the shared proxy pool topped up.

    Runs forever (intended for a daemon thread). Polls the proxy API,
    parses whitespace-separated "ip:port" entries, and appends proxy
    dicts to config['proxies'].

    :param config: shared dict with 'proxy_api' (API URL) and
                   'proxies' (list the workers consume) keys
    """
    while True:
        # Pool is full enough — sleep instead of busy-spinning
        # (the original `continue` with no sleep pinned a CPU core).
        if len(config['proxies']) > 10:
            time.sleep(1)
            continue
        proxy_text = fetch_proxy(config['proxy_api'])
        if proxy_text is None:
            time.sleep(2)  # API call failed; back off before retrying
            continue
        # Split on any whitespace; skip empty strings produced by
        # leading/trailing whitespace so we never build a bare
        # 'http://' entry.
        for ip in re.split(r'\s+', proxy_text):
            if not ip:
                continue
            config['proxies'].append({
                'http': f'http://{ip}',
                # NOTE(review): https deliberately mapped to the http
                # scheme as in the original — confirm the API serves
                # plain HTTP proxies.
                'https': f'http://{ip}',
            })
        time.sleep(10)  # rate-limit calls to the proxy API
def fetch_proxy(proxy_api):
    """Fetch the raw proxy-list text from the proxy API.

    :param proxy_api: URL of the proxy-provider endpoint
    :return: response body as text, or None if the request failed
    """
    try:
        response = requests.get(proxy_api, timeout=10)
    except requests.RequestException:
        return None
    return response.text
if __name__ == '__main__':
    # Shared state: proxy source URL, response encoding, and the
    # proxy pool that the producer fills and the workers drain.
    spider_config = {
        "proxy_api": "http://ip.ipjldl.com/index.php/api/",
        "encoding": "utf-8",
        "proxies": [],
    }

    # Load the work queue with 100 identical Baidu search requests.
    url_queue = Queue()
    for _ in range(100):
        url_queue.put('https://www.baidu.com/baidu?tn=monline_4_dg&ie=utf-8&wd=requests+%E4%BD%BF%E7%94%A8%E4%BB%A3%E7%90%86')

    # Daemon producer thread keeps the proxy pool stocked.
    producer = Thread(target=get_proxy, args=(spider_config,))
    producer.daemon = True
    producer.start()
    time.sleep(2)  # give the producer a head start to fetch some proxies

    # Five daemon worker threads share the queue and the proxy pool.
    for _ in range(5):
        worker = Spider(url_queue, spider_config)
        worker.daemon = True
        worker.start()

    # Block until every queued URL has been marked task_done().
    url_queue.join()
    print('运行结束')