前言

封装自己的http请求库对我来说还是蛮有用处的,在实现URL请求或爬虫的时候,自定义库可以让我们的请求被封的几率更小,优点如下:

  1. 使用随机User-Agent来完成请求,使UA头不再是一眼就能识别出来的python默认UA

  2. 给请求中添加了CLIENT-IP、X-FORWARDED-FOR来伪装客户端

  3. 给请求中添加了Referer头来增加请求可信度

  4. 给请求头添加了Host

  5. 禁用了默认证书验证

  6. 禁用了自动跟随跳转

  7. 以及其他设置(例如统一的默认超时时间)

初始化请求配置

Import部分

from aiohttp import ClientSession, ClientResponse, ClientTimeout, FormData
from urllib.parse import unquote, urlparse
from requests_html import HTMLSession as session
from urllib3 import disable_warnings
from typing import Union, Dict
from loguru import logger
import re

# Silence urllib3 warnings up front; the request helpers in this file default to verify=False.
disable_warnings()
# Default value of the `timeout` parameter (seconds) for the request helpers in this file.
Timeout = 12
  • aiohttp 库用于处理异步请求

  • requests_html 库用于处理常规请求

  • urllib3 库用于屏蔽关闭证书验证后产生的不安全请求警告

  • urllib.parse 库用于格式化url,提取host

默认请求头

    def header(cls, switch: str = '', types: str = '', api: str = '', yq: bool = False) -> dict:
        """Build the default request headers with a random User-Agent and a spoofed client IP.

        :param switch: scenario selector; ``'Bing'`` sets the Bing ``Host`` and ``Referer``.
        :param types: ``"json"`` adds ``Content-Type: application/json``.
        :param api: when non-empty, its host part becomes ``Host`` and the whole value becomes
            ``Referer``; applied after ``switch``, so it overrides the Bing values.
        :param yq: when true, the User-Agent is replaced by a search-engine-spider string.
        :return: a freshly built dict mapping header names to values.
        """
        # One random 101.x.x.x address, reused for both client-IP style headers.
        octets = [str(random.randint(1, 255)) for _ in range(3)]
        fake_ip = "101." + ".".join(octets)

        user_agents = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
            'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (HTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (HTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (HTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (HTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (HTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (HTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (HTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (HTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
            'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (HTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (HTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
        )

        headers = {
            'User-Agent': random.choice(user_agents),
            "CLIENT-IP": fake_ip,
            "X-FORWARDED-FOR": fake_ip,
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        }

        if switch == 'Bing':
            headers['Host'] = 'cn.bing.com'
            headers['Referer'] = 'https://www.bing.com'
        if types == "json":
            headers['Content-Type'] = 'application/json'
        if api:
            # Host is the text between the last "://" and the next "/".
            headers["Host"] = api.rpartition('://')[2].partition('/')[0]
            headers["Referer"] = api
        if yq:
            headers["User-Agent"] = "Baiduspider+(+http://www.baidu.com/search/spider.htm);google|baiduspider|baidu|spider|sogou|bing|yahoo|soso|sosospider|360spider|youdao|jikeSpider;)"
        return headers

传入参数

  • switch:场景开关,目前只处理 'Bing'(为必应搜索设置对应的Host和Referer),可按需扩展到其他搜索引擎

  • types:传入 "json" 时添加 Content-Type: application/json 头

  • api:用于传入url并自定义Host头和Referer

  • yq:用于判断是否使用搜索引擎爬虫的头,一般用于检测目标站点是否存在面向搜索引擎的暗链

完善请求头

def g_headers(url: str, header: dict = None):
    """Return the default request headers completed with ``Host`` and ``Referer`` for *url*.

    :param url: URL the request will be sent to; its network location becomes ``Host`` and
        ``scheme://netloc`` becomes ``Referer``.
    :param header: optional caller-supplied headers; they are applied last and override the
        defaults. The dict itself is not modified.
    :return: a new dict of header names to values.
    """
    # The ``header`` parameter shadows the default-header builder of the same name, so a
    # plain ``header()`` here would call the caller's dict (or None) and raise TypeError.
    # Resolve the builder from module scope instead.
    # NOTE(review): assumes the builder is reachable as a module-level callable named
    # ``header`` that needs no arguments — confirm how it is exported from its class.
    builder = globals().get("header")
    _header = dict(builder()) if callable(builder) else {}
    try:
        ufo = urlparse(url)
        _header.update({
            "Host": ufo.netloc,
            "Referer": f"{ufo.scheme}://{ufo.netloc}",
        })
    except ValueError as err:
        # Best effort: keep the defaults when the URL cannot be parsed.
        print(f"Error parsing URL: {url}, Error: {err}")
    if header:
        _header.update(header)
    return _header
  • url:需要传入url用于添加host和Referer

  • header:自定义header头,覆盖默认请求头

GET请求

自定义GET请求

def r_get(
        url: str,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None,
        stream: bool = True,
        *args
):
    """Send a synchronous GET request through a short-lived session.

    :param url: target URL; passed through ``format_url`` before use.
    :param timeout: request timeout handed to the session.
    :param allow_redirects: whether redirects are followed (off by default).
    :param verify: whether the TLS certificate is verified (off by default).
    :param header: extra headers merged over the defaults by ``g_headers``.
    :param params: query-string parameters.
    :param stream: forwarded as the ``stream`` option of the request.
    :param args: extra positional values forwarded to ``Session.request``.
        NOTE(review): ``method`` and ``url`` are already given by keyword, so a non-empty
        ``args`` looks like it would clash with them and take the error path — confirm.
    :return: the response object, or ``None`` if any exception was raised (it is logged).
    """
    target = format_url(url)
    request_options = {
        'method': 'get',
        'url': target,
        'allow_redirects': allow_redirects,
        'timeout': timeout,
        'verify': verify,
        'headers': g_headers(url=target, header=header),
        'params': params,
        'stream': stream,
    }

    try:
        with session() as client:
            response = client.request(*args, **request_options)
        return response
    except Exception as exc:
        logger.error(f"Error: {exc},url:{target}")
        return None

自定义异步GET请求

async def async_get(
        url: str,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: Dict[str, str] = None,
        params: dict = None
) -> Union[ClientResponse, None]:
    """
    Send an asynchronous HTTP GET request.

    :param url: target URL; passed through ``format_url`` before use
    :param timeout: total request timeout in seconds
    :param allow_redirects: whether redirects are followed (defaults to False)
    :param verify: whether the SSL certificate is verified (defaults to False)
    :param header: extra headers merged over the defaults by ``g_headers``
    :param params: URL query parameters
    :return: the ``aiohttp.ClientResponse`` with its body already read, or None if an
        exception occurred (it is logged)
    """

    url = format_url(url)
    header = g_headers(url=url, header=header)
    try:
        # NOTE(review): ``verify`` is also passed as ``trust_env``, which in aiohttp controls
        # reading proxy settings from the environment, not certificates — confirm intended.
        async with ClientSession(trust_env=verify, timeout=ClientTimeout(total=timeout), headers=header) as S:
            async with S.get(url=url, allow_redirects=allow_redirects, params=params, ssl=verify) as resp:
                # Read and decode the body while the connection is still open; the response
                # is handed back only after both context managers have exited.
                try:
                    await resp.text()
                except (UnicodeDecodeError, LookupError):
                    # Only decoding problems fall back to GBK. Network errors and task
                    # cancellation must not be swallowed and retried here.
                    await resp.text('gbk')
                return resp
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
        return None

POST请求

自定义POST请求

def r_post(
        url: str,
        data=None,
        json: dict = None,
        files=None,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None,
        stream: bool = True,
        *args
):
    """
    Send a synchronous POST request through a short-lived session.

    :param url: target URL; passed through ``format_url`` before use.
    :param data: request body (form data or raw payload).
    :param json: JSON-serialisable body.
    :param files: files to upload.
    :param timeout: request timeout handed to the session.
    :param allow_redirects: whether redirects are followed (off by default).
    :param verify: whether the TLS certificate is verified (off by default).
    :param header: extra headers merged over the defaults by ``g_headers``. When given
        without a ``Content-Type`` and neither ``json`` nor ``files`` is used, a
        form-urlencoded ``Content-Type`` is added. The caller's dict is left untouched.
    :param params: query-string parameters.
    :param stream: forwarded as the ``stream`` option of the request.
    :param args: extra positional values forwarded to ``Session.request``.
    :return: the response object, or ``None`` if any exception was raised (it is logged).
    """
    url = format_url(url)
    if header is not None and 'Content-Type' not in header and json is None and files is None:
        # Build a new dict instead of update(): the caller may reuse its headers elsewhere.
        header = {**header, "Content-Type": "application/x-www-form-urlencoded"}
    header = g_headers(url=url, header=header)
    try:
        with session() as S:
            return S.request(method='post', url=url, files=files, data=data, json=json, allow_redirects=allow_redirects,
                             timeout=timeout, verify=verify, headers=header, params=params, stream=stream, *args)
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
        return None

自定义异步POST请求

async def async_post(
        url: str,
        data=None,
        json: dict = None,
        files=None,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None
) -> Union[ClientResponse, None]:
    """
    Send an asynchronous HTTP POST request.

    :param url: target URL; passed through ``format_url`` before use
    :param data: request body; when ``files`` is given, its items are added to the
        multipart form as string fields instead
    :param json: JSON body
    :param files: mapping of field name to a sequence ``(filename, value, content_type)``;
        missing trailing items default to an empty string
    :param timeout: total request timeout in seconds (defaults to the module ``Timeout``)
    :param allow_redirects: whether redirects are followed (defaults to False)
    :param verify: whether the SSL certificate is verified (defaults to False)
    :param header: extra headers merged over the defaults by ``g_headers``; when given
        without a ``Content-Type`` and neither ``json`` nor ``files`` is used, a
        form-urlencoded ``Content-Type`` is added. The caller's dict is left untouched.
    :param params: query parameters appended to the URL
    :return: the ``aiohttp.ClientResponse`` with its body already read, or None if an
        exception occurred (it is logged)
    """
    url = format_url(url)
    if header is not None and 'Content-Type' not in header and json is None and files is None:
        # Build a new dict instead of update(): the caller may reuse its headers elsewhere.
        header = {**header, "Content-Type": "application/x-www-form-urlencoded"}
    header = g_headers(url=url, header=header)

    form = None
    if files is not None:
        form = FormData()
        for field_name in files:
            spec = files[field_name]
            form.add_field(
                name=field_name,
                filename=spec[0] if spec else '',
                value=spec[1] if len(spec) > 1 else '',
                content_type=spec[2] if len(spec) > 2 else ''
            )
        if data:
            # Plain form fields travel in the same multipart body as the files.
            for key in data:
                form.add_field(key, str(data[key]))

    try:
        # The session is created inside the try so that setup failures are logged and
        # reported as None, the same way async_get and async_put behave.
        async with ClientSession(trust_env=verify, timeout=ClientTimeout(total=timeout), headers=header) as S:
            async with S.post(url=url, data=data if form is None else form, json=json,
                              allow_redirects=allow_redirects, params=params, ssl=verify) as resp:
                # Read the body while the connection is still open.
                await resp.text()
                return resp
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
        return None

PUT请求

自定义PUT请求

def r_put(
        url: str,
        data=None,
        json: dict = None,
        files=None,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None,
        stream: bool = True,
        *args
):
    """
    Send a synchronous PUT request through a short-lived session.

    :param url: target URL; passed through ``format_url`` before use.
    :param data: request body (form data or raw payload).
    :param json: JSON-serialisable body.
    :param files: files to upload.
    :param timeout: request timeout handed to the session.
    :param allow_redirects: whether redirects are followed (off by default).
    :param verify: whether the TLS certificate is verified (off by default).
    :param header: extra headers merged over the defaults by ``g_headers``.
    :param params: query-string parameters.
    :param stream: forwarded as the ``stream`` option of the request.
    :param args: extra positional values forwarded to ``Session.request``.
    :return: the response object, or ``None`` if any exception was raised (it is logged).
    """
    target = format_url(url)
    request_options = {
        'method': 'put',
        'url': target,
        'files': files,
        'data': data,
        'json': json,
        'allow_redirects': allow_redirects,
        'timeout': timeout,
        'verify': verify,
        'headers': g_headers(url=target, header=header),
        'params': params,
        'stream': stream,
    }
    try:
        with session() as client:
            response = client.request(*args, **request_options)
        return response
    except Exception as exc:
        logger.error(f"Error: {exc},url:{target}")
        return None

自定义异步PUT请求

async def async_put(url: str, data=None, timeout: int = Timeout, allow_redirects: bool = False, verify: bool = False,
                    header: dict = None, params=None) -> Union[ClientResponse, None]:
    """
    Send an asynchronous HTTP PUT request.

    :param url: target URL; passed through ``format_url`` before use.
    :param data: data sent in the request body.
    :param timeout: total request timeout in seconds.
    :param allow_redirects: whether redirects are followed (defaults to False).
    :param verify: whether the SSL certificate is verified (defaults to False).
    :param header: extra headers merged over the defaults by ``g_headers``.
    :param params: query parameters appended to the URL (optional, same as in the other
        async helpers).
    :return: the ``aiohttp.ClientResponse`` with its body already read, or None if an
        exception occurred (it is logged).
    """
    url = format_url(url)
    header = g_headers(url=url, header=header)

    try:
        async with ClientSession(timeout=ClientTimeout(total=timeout), trust_env=verify, headers=header) as S:
            async with S.put(url, data=data, allow_redirects=allow_redirects, params=params, ssl=verify) as resp:
                # Read the body while the connection is still open.
                await resp.text()
                return resp
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
        return None

😊恭喜你,你已经有了一个可以默认模拟真人的请求方法了