I. Template for sending an SMS verification code:
import requests
import json


class YunPian(object):
    def __init__(self, api_key):
        self.api_key = api_key
        self.single_send_url = "https://sms.yunpian.com/v2/"

    def send_sms(self, code, mobile):
        # Text template: "Your verification code is {code}. If this was not you, please ignore this message."
        params = {
            "apikey": self.api_key,
            "mobile": mobile,
            "text": "您的验证码是{code},如非本人操作,请忽略本信息。".format(code=code)
        }
        response = requests.post(self.single_send_url, data=params)
        re_dict = json.loads(response.text)
        print(re_dict)


if __name__ == "__main__":
    yun_pian = YunPian("xxxx")
    yun_pian.send_sms("2007", "18782902356")
Another pitfall to watch out for here: the keyword argument in `response = requests.post(self.single_send_url, data=params)` matters. POST sends the form data with `data=...`, but if you use the GET method the parameters must be passed as `params=...`, e.g. `params=payload`.
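As a minimal sketch of the difference (httpbin.org is used here purely as a demo endpoint, not part of the original example):

import requests

payload = {"q": "keyword", "page": 1}

# GET: the dict is encoded into the query string via params=
r1 = requests.get("https://httpbin.org/get", params=payload, timeout=10)
print(r1.url)            # ...?q=keyword&page=1

# POST: the dict is sent in the request body via data=
r2 = requests.post("https://httpbin.org/post", data=payload, timeout=10)
print(r2.status_code)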
II. Page encoding
1. Encoding
import requests

html = requests.get('https://www.baidu.com')
html.encoding = 'utf-8'
print(html.text)
2. Letting requests detect the encoding automatically
import requests

response = requests.get("http://www.x.com")
# Detect the page's actual encoding and use it for decoding
response.encoding = response.apparent_encoding
3. Garbled text (mojibake) from requests
Even `html.text.encode('utf-8')` will not print readable output.
The fix for the garbled text:
In fact, the first three lines are enough.
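Since the screenshot with the fix is not reproduced here, this is a minimal sketch of the usual three-line fix (www.example.com is only a placeholder URL):

import requests

html = requests.get("https://www.example.com", timeout=10)
html.encoding = html.apparent_encoding   # use the detected charset instead of the default
print(html.text)                         # prints readable text instead of mojibake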
IV. What requests returns
import requests

response = requests.get("https://www.baidu.com", data=None, timeout=10)

# # Print the response headers
# print("Response headers:", response.headers)

# Print the status code (200); printing the response object itself gives <Response [200]>
print("Status code:", response.status_code)
print(response)
print("Request object:", response.request)
print("Cookies returned:", response.cookies)
print("Request ok?:", response.ok)
V. requests.get
def get_url_content(self, url, max_try_number=5):
    # Wrapper around requests.get that retries on failure
    try_num = 0
    while True:
        try:
            return requests.get(url, timeout=5)
        except Exception as http_err:
            print(url, "fetch error", http_err)
            try_num += 1
            if try_num >= max_try_number:
                print("Too many failed attempts, giving up")
                return None
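A hypothetical caller might use the wrapper like this (assuming the method lives in a spider class; drop `self` if you use it as a plain function, and the URL is only a placeholder):

response = get_url_content("https://www.example.com")
if response is None:
    print("All retries failed, skipping this URL")
else:
    print(response.status_code)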
How do you handle errors in what the request returns?
from bs4 import BeautifulSoup

url = '<div class="caller_ref">这是div</div> <a href="/tomasi/cardio/vgh/SPsdeGBHH">超链接</a>'
soup = BeautifulSoup(url, 'html.parser')
print("Length of soup: {}".format(len(soup)))
if soup.title is None:
    print("No title, title is None")
content = soup.find('div', {'class': 'caller_ref1'})
if content is None:
    print("div not found, find() returned None")
Length of soup: 3
No title, title is None
div not found, find() returned None
So use len() to test the soup itself (you cannot compare it to None); even when the soup is empty, the program will not raise an error.
Use None to test whether title, or the content returned by find(), is empty.
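Putting the two kinds of checks together, a minimal sketch (the URL and the class name are placeholders, not from the original example):

import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.example.com", timeout=5)     # placeholder URL
if not response.ok:                                               # HTTP-level check on the requests side
    print("Request failed:", response.status_code)
else:
    soup = BeautifulSoup(response.text, "html.parser")
    if len(soup) == 0:                                            # the soup itself: check with len()
        print("Empty page")
    content = soup.find("div", {"class": "caller_ref"})           # placeholder class name
    if content is None:                                           # individual tags: check against None
        print("div not found")
    else:
        print(content.get_text())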
VI. Adding a proxy
1. First add your own machine's IP to the whitelist at http://h.zhimaruanjian.com/getapi/.
2. Relevant parameters
Set the number of IPs to fetch to 1.
3. Code to fetch a proxy IP
Also, when scraping foreign sites I found that fetching with my own IP is slow and sometimes fails entirely, while going through a proxy IP is actually much faster.
import requests
import json
import re

# Add an HTTP proxy, using Zhima proxy as an example
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '',
    'DNT': '1',
    'Host': 'www.baidu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# Fetch one proxy IP (plain-text API)
def get_http_ip():
    url = "http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=11&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions="
    html = requests.get(url, timeout=30)
    ip = html.content.strip()
    return bytes.decode(ip)

# def get_http_ip(num):
#     # Fetch proxy IPs via the JSON API instead
#     url = "http://webapi.http.zhimacangku.com/getip?num={}&type=2&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=".format(num)
#     html = requests.get(url, timeout=30).text
#     json_str = json.loads(html).get('data')
#     print(json_str)

proxy_ip = get_http_ip()
proxies = {
    "http": "http://{ip}".format(ip=proxy_ip),
    "https": "http://{ip}".format(ip=proxy_ip),
}

html = requests.get('https://www.baidu.com/baidu?tn=dealio_dg&wd=ip', headers=headers, proxies=proxies)
html.encoding = 'utf-8'
content = html.text

# Extract the IP that Baidu reports back, to confirm the proxy is in effect
ip = re.search(r'class="c-gap-right">本机IP: (.*?)</span>', content)
print(ip.group(1))
4. Result
5. Proxy pool approach
https://blog.csdn.net/qq_37978800/article/details/108182356
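The linked post covers a fuller solution; as a rough illustration of the idea only (not the implementation from that post), a pool can simply cache several fetched IPs, pick one at random, and discard any that stop working:

import random
import requests

proxy_pool = ["1.2.3.4:8080", "5.6.7.8:8080"]    # placeholder IPs, e.g. filled from the Zhima API

def get_with_pool(url):
    while proxy_pool:
        ip = random.choice(proxy_pool)
        proxies = {"http": "http://" + ip, "https": "http://" + ip}
        try:
            return requests.get(url, proxies=proxies, timeout=5)
        except Exception:
            proxy_pool.remove(ip)                # drop a proxy that no longer works
    return None                                  # pool exhausted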
VII. Scraping Sogou
(1) Result
(2) Code
I found that after adding the Cookie there is no need to rotate IPs; scraping 100 list pages in a row caused no problems.
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import random
import time

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Connection": "keep-alive",
    "Cookie": "IPLOC=CN4401; SUID=3D88E578302EA00A5EE0A5C700091A7A; CXID=2D2E573D28DBF81F0D5EE82AE194B591; wuid=AAFfVMjnMQAAAAqHDEAx3wAAkwA=; ssuid=4566914326; usid=MQ7Cat9I6sYTA4pv; SNUID=BCA801FC898D3212BEC05BC4892BFA83; SUV=00323ADCDF4A72625F8983AF6ABFA771; FREQUENCY=1602847663829_2; front_screen_resolution=1920*1080; front_screen_dpi=1; sw_uuid=1992113152; ad=wkllllllll2k46ovlllllplWvPwlllllWnQyZyllll9lllllxklll5@@@@@@@@@@; ABTEST=0|1613645893|v1; weixinIndexVisited=1; ld=nkllllllll2kaum2lllllplmziwlllllNYkP2yllllwlll ppinfo=fa4414ad0d; passport=5|1613732661|1614942261|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1OkJvd2VufGNydDoxMDoxNjEzNzMyNjYxfHJlZm5pY2s6NTpCb3dlbnx1c2VyaWQ6NDQ6bzl0Mmx1Smo1YlMtYk9FRVFMblJMbmxiV3JWMEB3ZWl4aW4uc29odS5jb218|f003b0d15e|JeJkSzFeHtPrilwydblkqBT-ATqXzrachzaryN0RBTvCHeRXs03-K4fxZ1wfTRU027D4KSjaaG6Su6s2zp9AAjpjKwloLvQWBr3lUWVleKg8RszyfMm5Zpc6HqBQX7GfNyFl-p3KXcKdzNF3HHLxU2HJWnXed7AWY6i-TtbGvm8; sgid=05-49569309-AWAvmzW7J4aNYZkPO1K8yA8; ppmdig=1613732663000000b8e1c27bc97c631f915fcdca3f8f2c32",
    "Host": "weixin.sogou.com",
    "Referer": "https://weixin.sogou.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
}

def get_proxy():
    # Fetch one proxy IP from the Zhima API
    url = "http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions="
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            ip = response.content.strip()
            return bytes.decode(ip)
    except ConnectionError:
        return None

url2 = "https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=%d" + "&ie=utf8"  # generic pagination URL template

for pg in range(1, 4):
    new_url = "https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page={}".format(pg) + "&ie=utf8"
    # print(new_url)
    proxy = get_proxy()
    proxies = {
        'http': 'http://' + proxy
    }
    print("Current proxy in use:", proxies)
    html2 = requests.get(url=new_url, headers=headers, proxies=proxies)
    print(html2.status_code, "-----" + str(pg) + new_url)
    tree2 = etree.HTML(html2.text)
    title = tree2.xpath("//div[@class='txt-box']/h3/a//text()")
    print(title)
    # nr = tree2.xpath("//div[@class='txt-box']/p//text()")
    # print(nr)
    print("-" * 80)
    time.sleep(1)  # The proxy API only returns one IP per second, so pause for 1 second here