前言
封装自己的http请求库对我来说还是蛮有用处的,在实现URL请求或爬虫的时候,自定义库可以让我们的请求被封的几率更小,优点如下:
使用随机User-Agent来完成请求,使UA头不再是请求库默认的python标识(如 python-requests/x.y)
给请求中添加了CLIENT-IP、X-FORWARDED-FOR来伪装客户端
给请求中添加了Referer头来增加请求可信度
给请求头添加了Host
禁用了默认证书验证
禁用了自动跟随跳转
等其他设置
初始化请求配置
Import部分
from aiohttp import ClientSession, ClientResponse, ClientTimeout, FormData
from urllib.parse import unquote, urlparse
from requests_html import HTMLSession as session
from urllib3 import disable_warnings
from typing import Union, Dict
from loguru import logger
import re
# Suppress urllib3's warnings process-wide; the helpers in this file default to
# verify=False, which would otherwise emit a warning on each HTTPS request.
disable_warnings()
# Default value for the `timeout` parameter of every request helper in this file
# (the helpers' docstrings describe it as seconds).
Timeout = 12
aiohttp 库用于处理异步请求
requests_html 库用于处理常规请求
urllib3 库用于屏蔽关闭证书验证后产生的不安全请求告警(InsecureRequestWarning)
urllib.parse 库用于格式化url,提取host
默认请求头
def header(cls=None, switch: str = '', types: str = '', api: str = '', yq: bool = False) -> dict:
    """Build a default, browser-like set of request headers.

    A random User-Agent is picked from a fixed list and a random ``101.x.x.x``
    address is placed in both ``CLIENT-IP`` and ``X-FORWARDED-FOR``.

    :param cls: Unused; kept (now optional) so existing positional callers and
        the no-argument call ``header()`` both work.
    :param switch: When equal to ``'Bing'``, sets ``Host``/``Referer`` for Bing.
    :param types: When equal to ``"json"``, adds a JSON ``Content-Type``.
    :param api: When non-empty, ``Host`` is derived from this URL and
        ``Referer`` is set to it (overrides the ``switch`` values).
    :param yq: When true, replaces the User-Agent with a search-engine-spider one.
    :return: A new dict of header names to values.
    """
    # Local import: `random` is not among the imports visible at the top of this
    # file, so relying on a module-level name would raise NameError.
    import random

    ip = f"101.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}"
    _u = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (HTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (HTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (HTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (HTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (HTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (HTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (HTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (HTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (HTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (HTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (HTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    ]
    headers = {
        'User-Agent': random.choice(_u),
        "CLIENT-IP": ip,
        "X-FORWARDED-FOR": ip,
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }
    # The order matters: `api` is applied after `switch`, so it wins on Host/Referer.
    if switch == 'Bing':
        headers.update({'Host': 'cn.bing.com', 'Referer': 'https://www.bing.com'})
    if types == "json":
        headers.update({'Content-Type': 'application/json'})
    if api:
        headers.update({"Host": api.split('://')[-1].split('/')[0], "Referer": api})
    if yq:
        headers.update({
            "User-Agent": "Baiduspider+(+http://www.baidu.com/search/spider.htm);google|baiduspider|baidu|spider|sogou|bing|yahoo|soso|sosospider|360spider|youdao|jikeSpider;)",
        })
    return headers
传入参数
switch:可以自定义,这里设计的目的是给搜索引擎使用
types:用于声明json头
api:用于传入url并自定义Host头和Referer
yq:用于判断是否使用搜索引擎爬虫的头,一般用于检测目标站点是否存在面向搜索引擎的暗链
完善请求头
def g_headers(url: str, header: dict = None):
    """Return the default headers completed with ``Host``/``Referer`` for *url*.

    :param url: Target URL; its network location becomes ``Host`` and its
        scheme plus network location becomes ``Referer``.
    :param header: Optional caller-supplied headers; applied last so they
        override the defaults. The dict itself is not modified.
    :return: A new dict of headers.
    """
    # The `header` parameter shadows the module-level `header()` factory, so a
    # plain `header()` call here would invoke the argument (None by default) and
    # raise TypeError. Look the factory up explicitly; None fills its `cls` slot.
    _header = globals()["header"](None)
    try:
        ufo = urlparse(url)
    except ValueError as err:
        # Best effort: keep the defaults without Host/Referer for an unparsable URL.
        logger.error(f"Error parsing URL: {url}, Error: {err}")
    else:
        _header.update({"Host": ufo.netloc, "Referer": ufo.scheme + '://' + ufo.netloc})
    if header:
        _header.update(header)
    return _header
url:需要传入url用于添加host和Referer
header:自定义header头,覆盖默认请求头
GET请求
自定义GET请求
def r_get(
        url: str,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None,
        stream: bool = True,
        *args
):
    """Send a synchronous GET request with the file's default headers.

    The URL is passed through ``format_url`` and the headers through
    ``g_headers`` before the request is issued on a short-lived session.

    :param url: Target URL.
    :param timeout: Timeout forwarded to the session's ``request``.
    :param allow_redirects: Whether redirects are followed.
    :param verify: Whether the TLS certificate is verified.
    :param header: Extra headers that override the defaults.
    :param params: Query parameters.
    :param stream: Forwarded to the session's ``request``.
    :param args: Extra positional arguments forwarded to ``request``.
    :return: The response returned by ``request``, or ``None`` if the request
        raised (the error is logged).
    """
    url = format_url(url)
    request_options = dict(
        method='get',
        url=url,
        allow_redirects=allow_redirects,
        timeout=timeout,
        verify=verify,
        headers=g_headers(url=url, header=header),
        params=params,
        stream=stream,
    )
    try:
        with session() as client:
            return client.request(*args, **request_options)
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
    return None
自定义异步GET请求
async def async_get(
        url: str,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: Dict[str, str] = None,
        params: dict = None
) -> Union[ClientResponse, None]:
    """Send an asynchronous HTTP GET request.

    The body is read and decoded before the session closes, so the returned
    response can still serve its text afterwards.

    :param url: Request URL.
    :param timeout: Total request timeout in seconds.
    :param allow_redirects: Whether redirects are followed.
    :param verify: Whether the TLS certificate is verified (defaults to False).
    :param header: Extra headers that override the defaults.
    :param params: URL query parameters.
    :return: The ``aiohttp.ClientResponse``, or ``None`` if an exception
        occurred (the error is logged).
    """
    url = format_url(url)
    header = g_headers(url=url, header=header)
    try:
        # NOTE(review): aiohttp's `trust_env` governs proxy settings taken from the
        # environment, not certificate checks; tying it to `verify` looks
        # unintended but is kept so existing proxy behaviour does not change.
        async with ClientSession(trust_env=verify, timeout=ClientTimeout(total=timeout), headers=header) as S:
            async with S.get(url=url, allow_redirects=allow_redirects, params=params, ssl=verify) as resp:
                try:
                    await resp.text()
                except (UnicodeDecodeError, LookupError):
                    # Only decoding problems trigger the GBK retry; a bare except
                    # here would also swallow task cancellation.
                    await resp.text('gbk')
                return resp
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
    return None
POST请求
自定义POST请求
def r_post(
        url: str,
        data=None,
        json: dict = None,
        files=None,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None,
        stream: bool = True,
        *args
):
    """Send a synchronous POST request with the file's default headers.

    :param url: Target URL.
    :param data: Request body (form data or raw body).
    :param json: JSON-serialisable body.
    :param files: Files to upload.
    :param timeout: Timeout forwarded to the session's ``request``.
    :param allow_redirects: Whether redirects are followed.
    :param verify: Whether the TLS certificate is verified.
    :param header: Extra headers that override the defaults. When given without
        a Content-Type, and neither ``json`` nor ``files`` is used, a
        form-urlencoded Content-Type is added. The caller's dict is not modified.
    :param params: Query parameters.
    :param stream: Forwarded to the session's ``request``.
    :param args: Extra positional arguments forwarded to ``request``.
    :return: The response returned by ``request``, or ``None`` if the request
        raised (the error is logged).
    """
    url = format_url(url)
    if header is not None and json is None and files is None \
            and not any(str(name).lower() == 'content-type' for name in header):
        # Build a new dict instead of header.update(...) so the caller's dict is
        # left untouched; header names are compared case-insensitively.
        header = {**header, "Content-Type": "application/x-www-form-urlencoded"}
    header = g_headers(url=url, header=header)
    try:
        with session() as S:
            return S.request(method='post', url=url, files=files, data=data, json=json,
                             allow_redirects=allow_redirects, timeout=timeout, verify=verify,
                             headers=header, params=params, stream=stream, *args)
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
    return None
自定义异步POST请求
async def async_post(
        url: str,
        data=None,
        json: dict = None,
        files=None,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None
) -> Union[ClientResponse, None]:
    """Send an asynchronous HTTP POST request.

    :param url: Request URL.
    :param data: Request body. When ``files`` is given, ``data`` is iterated as
        a mapping and its items are added to the multipart form as text fields.
    :param json: JSON body.
    :param files: Mapping of field name to a sequence of
        ``(filename, value[, content_type])``.
    :param timeout: Total request timeout in seconds (defaults to ``Timeout``).
    :param allow_redirects: Whether redirects are followed (defaults to False).
    :param verify: Whether the TLS certificate is verified (defaults to False).
    :param header: Extra headers that override the defaults. When given without
        a Content-Type, and neither ``json`` nor ``files`` is used, a
        form-urlencoded Content-Type is added. The caller's dict is not modified.
    :param params: Query parameters appended to the URL.
    :return: The ``aiohttp.ClientResponse``, or ``None`` if an exception
        occurred while sending (the error is logged).
    """
    url = format_url(url)
    if header is not None and json is None and files is None \
            and not any(str(name).lower() == 'content-type' for name in header):
        # Build a new dict instead of header.update(...) so the caller's dict is
        # left untouched; header names are compared case-insensitively.
        header = {**header, "Content-Type": "application/x-www-form-urlencoded"}
    header = g_headers(url=url, header=header)

    body = data
    if files is not None:
        form = FormData()
        for field_name in files:
            spec = files[field_name]
            form.add_field(
                name=field_name,
                filename=spec[0] if spec else '',
                value=spec[1] if len(spec) > 1 else '',
                # None lets aiohttp choose the part's content type; an empty
                # string would be sent as an empty Content-Type header.
                content_type=(spec[2] or None) if len(spec) > 2 else None,
            )
        if data:
            for key in data:
                form.add_field(key, str(data[key]))
        body = form

    try:
        # NOTE(review): aiohttp's `trust_env` governs proxy settings taken from the
        # environment, not certificate checks; kept as-is for compatibility.
        async with ClientSession(trust_env=verify, timeout=ClientTimeout(total=timeout), headers=header) as S:
            async with S.post(url=url, data=body, json=json,
                              allow_redirects=allow_redirects, params=params, ssl=verify) as resp:
                try:
                    await resp.text()
                except (UnicodeDecodeError, LookupError):
                    # Same GBK fallback as async_get, so a non-UTF-8 reply is
                    # not reported as a failed request.
                    await resp.text('gbk')
                return resp
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
    return None
PUT请求
自定义PUT请求
def r_put(
        url: str,
        data=None,
        json: dict = None,
        files=None,
        timeout: int = Timeout,
        allow_redirects: bool = False,
        verify: bool = False,
        header: dict = None,
        params=None,
        stream: bool = True,
        *args
):
    """Send a synchronous PUT request with the file's default headers.

    :param url: Target URL.
    :param data: Request body.
    :param json: JSON-serialisable body.
    :param files: Files to upload.
    :param timeout: Timeout forwarded to the session's ``request``.
    :param allow_redirects: Whether redirects are followed.
    :param verify: Whether the TLS certificate is verified.
    :param header: Extra headers that override the defaults.
    :param params: Query parameters.
    :param stream: Forwarded to the session's ``request``.
    :param args: Extra positional arguments forwarded to ``request``.
    :return: The response returned by ``request``, or ``None`` if the request
        raised (the error is logged).
    """
    url = format_url(url)
    request_options = dict(
        method='put',
        url=url,
        files=files,
        data=data,
        json=json,
        allow_redirects=allow_redirects,
        timeout=timeout,
        verify=verify,
        headers=g_headers(url=url, header=header),
        params=params,
        stream=stream,
    )
    try:
        with session() as client:
            return client.request(*args, **request_options)
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
    return None
自定义异步PUT请求
async def async_put(url: str, data=None, timeout: int = Timeout, allow_redirects: bool = False, verify: bool = False,
                    header: dict = None, params=None, json: dict = None) -> Union[ClientResponse, None]:
    """Send an asynchronous HTTP PUT request.

    :param url: Request URL.
    :param data: Data sent in the request body.
    :param timeout: Total request timeout in seconds.
    :param allow_redirects: Whether redirects are followed.
    :param verify: Whether the TLS certificate is verified.
    :param header: Extra headers that override the defaults.
    :param params: Query parameters appended to the URL (added for parity with
        ``r_put``; optional).
    :param json: JSON body (added for parity with ``r_put``; optional, not to
        be combined with ``data``).
    :return: The ``aiohttp.ClientResponse``, or ``None`` if an exception
        occurred (the error is logged).
    """
    url = format_url(url)
    header = g_headers(url=url, header=header)
    try:
        # NOTE(review): aiohttp's `trust_env` governs proxy settings taken from the
        # environment, not certificate checks; kept as-is for compatibility.
        async with ClientSession(timeout=ClientTimeout(total=timeout), trust_env=verify, headers=header) as S:
            async with S.put(url, data=data, json=json, params=params,
                             allow_redirects=allow_redirects, ssl=verify) as resp:
                try:
                    await resp.text()
                except (UnicodeDecodeError, LookupError):
                    # Same GBK fallback as async_get, so a non-UTF-8 reply is
                    # not reported as a failed request.
                    await resp.text('gbk')
                return resp
    except Exception as e:
        logger.error(f"Error: {e},url:{url}")
    return None
😊恭喜你,你已经有了一个可以默认模拟真人的请求方法了
评论