
Multi-threaded proxy IP switching example

zhangchap (2022-02-17)
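The whole script below revolves around one feature of requests: passing a proxies mapping routes the request through a proxy server. Here is a minimal sketch of just that step (the 127.0.0.1:8888 address is only a placeholder, not a real proxy):

import requests

# Placeholder proxy; substitute a real ip:port from a proxy provider
proxies = {
   'http': 'http://127.0.0.1:8888',
   'https': 'http://127.0.0.1:8888',
}
r = requests.get('https://www.baidu.com', proxies=proxies, timeout=5)
print(r.status_code)

The full multi-threaded version: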
# -*- coding:utf-8 -*-
"""
多线程代理ip切换实例
"""
from threading import Thread
from queue import Queue
from random import choice
import requests
import re
import time

class Spider(Thread):
   def __init__(self,queue:Queue,config:dict):
      super(Spider, self).__init__()
      self.queue = queue
      self.config = config
      self.headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
      }

   def run(self) -> None:
      while True:
         try:
            url = self.queue.get()
            # Always use the first proxy in the pool
            proxy = self.config['proxies'][0] if self.config['proxies'] else None
            print(proxy)
            # Alternative: pick a random proxy
            # proxy = choice(self.config['proxies']) if self.config['proxies'] else None
            source = self.download(url, proxy)
            if source is None:
               print('Failed to fetch the page source')
               self.change_proxy(proxy, url)
               continue  # the URL has been re-queued, so skip the rest
            # print(source)
            title = self.get_title(source)
            if title == "百度安全验证":  # Baidu's anti-bot verification page
               self.change_proxy(proxy, url)
               continue
            print(title)
         finally:
            # task_done() still runs on failure, because the failed URL
            # was already put back into the queue as a new task.
            self.queue.task_done()

   def change_proxy(self, proxy, url):
      # Drop the failed proxy from the pool; the next loop iteration will
      # pick up a fresh one from config['proxies'] on its own.
      if proxy in self.config['proxies']:
         self.config['proxies'].remove(proxy)
      self.queue.put(url)  # put the failed URL back into the queue

   @staticmethod
   def get_title(source):
      if not isinstance(source,str):
         return
      return ''.join(re.findall(r'<title>(.*?)</title>',source,flags=re.I))


   def download(self, url, proxy=None):
      try:
         r = requests.get(url, proxies=proxy, headers=self.headers, timeout=5)
      except requests.RequestException:
         return None
      r.encoding = self.config['encoding']
      return r.text

def get_proxy(config: dict):
   while True:
      # If the pool already holds more than 10 proxies, pause fetching
      if len(config['proxies']) > 10:
         time.sleep(1)  # avoid a busy loop while the pool is full
         continue
      proxy_text = fetch_proxy(config['proxy_api'])
      if proxy_text is None:
         time.sleep(2)  # fetching proxies failed; retry after 2 seconds
         continue
      # The API is expected to return whitespace-separated ip:port entries
      for ip in re.split(r'\s+', proxy_text):
         if not ip:  # skip empty strings from leading/trailing whitespace
            continue
         proxies = {
            'http': f'http://{ip}',
            'https': f'http://{ip}',
         }
         config['proxies'].append(proxies)
      time.sleep(10)


def fetch_proxy(proxy_api):
   try:
      r = requests.get(proxy_api,timeout=10)
   except requests.RequestException as e:
      return
   return r.text



if __name__ == '__main__':
   spider_config = {
      "proxy_api" : "http://ip.ipjldl.com/index.php/api/" ,
      "encoding":"utf-8",
      "proxies":[],
   }
   url_queue = Queue()
   for i in range(100):
      url_queue.put('https://www.baidu.com/baidu?tn=monline_4_dg&ie=utf-8&wd=requests+%E4%BD%BF%E7%94%A8%E4%BB%A3%E7%90%86')

   p = Thread(target=get_proxy,args=(spider_config,))
   p.daemon = True
   p.start()
   time.sleep(2)

   # get_proxy(spider_config)

   for i in range(5):
      sp = Spider(url_queue,spider_config)
      sp.daemon = True
      sp.start()

   url_queue.join()
   print('All tasks finished')
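
One assumption worth calling out: get_proxy() expects the proxy API to return plain text with one ip:port per line (whitespace-separated), which it splits and wraps into requests-style proxy dicts. A standalone check of that parsing step, using made-up addresses:

import re

# Made-up sample of the assumed API response: one ip:port per line
sample_response = "1.2.3.4:8080\n5.6.7.8:3128\n"

pool = []
for ip in re.split(r'\s+', sample_response):
   if not ip:  # skip the empty string left by the trailing newline
      continue
   pool.append({
      'http': f'http://{ip}',
      'https': f'http://{ip}',
   })

print(pool)
# [{'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'},
#  {'http': 'http://5.6.7.8:3128', 'https': 'http://5.6.7.8:3128'}]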

