Python Data Scraping
I. Scripts in the example1 package
Note: first install the library from a terminal with pip3 install requests. (pip's -i flag expects a package index URL such as a PyPI mirror; https://requests.readthedocs.io/en/latest/ is the requests documentation site, not an index.)
1. A first look at Python web scraping: GET requests
import requests
r = requests.get('https://www.baidu.com')    # fetch the Baidu homepage
print(r.text)
r1 = requests.get('https://www.jd.com')      # fetch the JD homepage
print(r1.text)
r2 = requests.get('https://httpbin.org/ip')  # httpbin echoes the caller's IP
print(r2.text)
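Some sites (baidu.com among them) can come back garbled in r.text because requests guesses the encoding from the headers alone. A minimal sketch of checking and overriding that guess, building on the request above:
import requests
r = requests.get('https://www.baidu.com')
print(r.encoding)                  # encoding guessed from the headers (often ISO-8859-1)
r.encoding = r.apparent_encoding   # re-detect the encoding from the body itself
print(r.text[:200])                # the decoded text should no longer be garbled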
2. Requests with parameters
import requests
data={"key1":"value1","key2":"value2"}
response=requests.get("http://httpbin.org/get",params=data)
print(response.text)
print(response.status_code)
print(response.headers)
print(response.url)
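Since httpbin echoes the query parameters back under the "args" key, the parsed JSON can be used to confirm exactly what was sent; a small sketch building on the request above:
import requests
data = {"key1": "value1", "key2": "value2"}
response = requests.get("http://httpbin.org/get", params=data)
print(response.json()["args"])   # {'key1': 'value1', 'key2': 'value2'}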
3. POST requests
import requests
data={"name":"测试"}
response=requests.post("http://httpbin.org/post",data=data)
print(response.text)
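Besides form data, requests can also send a JSON body directly through the json= keyword, which serializes the dict and sets the Content-Type header to application/json; a minimal sketch:
import requests
payload = {"name": "test"}
response = requests.post("http://httpbin.org/post", json=payload)   # JSON body instead of a form
print(response.json()["json"])   # httpbin echoes the parsed JSON body back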
4. Retrieving response data
import requests
# Fetch an image
# response = requests.get('https://c-ssl.dtstatic.com/uploads/item/202004/17/20200417160907_laicg.thumb.400_0.gif')
# with open('image.gif', 'wb') as f:   # the URL is a .gif, so save it with a .gif extension
#     f.write(response.content)        # response.content holds the raw bytes
# Fetch JSON-formatted data
response = requests.get('http://httpbin.org/ip')
data = response.json()    # parse the JSON body into a dict
print(data)
print(data['origin'])     # the caller's IP address
print(response.status_code)
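For large binary responses it is safer to stream the body instead of holding it in memory all at once; a sketch using stream=True and iter_content (the output file name is illustrative):
import requests
url = 'https://c-ssl.dtstatic.com/uploads/item/202004/17/20200417160907_laicg.thumb.400_0.gif'
response = requests.get(url, stream=True)   # the body is fetched lazily
with open('image.gif', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):   # download in 8 KB pieces
        f.write(chunk)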
5. Custom request headers
import requests
header = {'user-agent': 'baidu/v1'}    # custom User-Agent string
response = requests.get('http://www.httpbin.org/get', headers=header)
print(response.request.headers)    # headers that were actually sent
print(response.headers)            # headers returned by the server
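Without a custom header, requests identifies itself as python-requests/&lt;version&gt;, which some sites reject; comparing the sent headers makes the difference visible. A small sketch:
import requests
default = requests.get('http://www.httpbin.org/get')
custom = requests.get('http://www.httpbin.org/get', headers={'user-agent': 'baidu/v1'})
print(default.request.headers['User-Agent'])   # python-requests/<version>
print(custom.request.headers['User-Agent'])    # baidu/v1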
6. Setting a timeout
import requests
response = requests.get('http://github.com', timeout=1)   # timeout in seconds; 2-3 seconds is usually enough
print(response.text)
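A request that exceeds the timeout raises requests.exceptions.Timeout, so real code usually wraps the call; timeout also accepts a (connect, read) tuple to control the two phases separately. A minimal sketch:
import requests
try:
    response = requests.get('http://github.com', timeout=(3.05, 10))   # (connect, read) in seconds
    print(response.status_code)
except requests.exceptions.Timeout:
    print('request timed out')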
7. Working with cookies
# What is a cookie?
# A cookie is a piece of data the server stores in the browser; it can hold
# things like browsing history or a unique identifier for the client.
#获取cookie
import requests
# 1. Read cookies set by the server
url="http://www.baidu.com"
header={'user-agent':'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 CrKey/1.54.250320 Edg/135.0.0.0'
}
response=requests.get(url,headers=header)
print(response.headers)
print(response.cookies)              # a RequestsCookieJar
print(response.cookies['BAIDUID'])   # read a single cookie by name
# 2. Send cookies with a request
url='http://www.httpbin.org/cookies'
cookies=dict(cookies_are='hello python')
response=requests.get(url,cookies=cookies)
print(response.text)
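To keep cookies across several requests automatically, use requests.Session(), which stores cookies the server sets and sends them back on later calls; a minimal sketch against httpbin:
import requests
s = requests.Session()
s.get('http://www.httpbin.org/cookies/set/session_id/abc123')   # server sets a cookie; the session keeps it
response = s.get('http://www.httpbin.org/cookies')              # the cookie is sent back automatically
print(response.text)                                            # the echoed cookies include session_id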
8. Setting a proxy
import requests
url='http://whois.pconline.com.cn/ipJson.jsp?json=true'
response = requests.get(url)   # direct request: the response shows your own IP
print(response.text)
# Through a proxy (the address and credentials below are placeholders)
proxy = {"http": "http://username:abcd1234@haha.xingsuaili.com:10010",
         "https": "http://username:abcd1234@haha.xingsuaili.com:10010"}
response = requests.get(url, proxies=proxy)
print(response.text)           # should now show the proxy's IP instead
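requests also honors the standard HTTP_PROXY / HTTPS_PROXY environment variables (trust_env is on by default), so a proxy can be configured once instead of passing proxies= on every call; a sketch with a hypothetical local proxy address:
import os
import requests
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:10809'    # hypothetical local proxy
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:10809'
response = requests.get('http://httpbin.org/ip')       # routed through the proxy
print(response.text)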
9. Certificate settings
import requests
# verify=False turns off certificate validation; if you have a certificate,
# point verify at it instead, e.g. verify='/path/to/certfile'
response = requests.get("https://www.hengxun.com/news/content/1028.html", verify=False)
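With verify=False, requests (via urllib3) prints an InsecureRequestWarning on every call; it can be silenced explicitly once the risk is understood. A sketch:
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)   # suppress the warning
response = requests.get("https://www.hengxun.com/news/content/1028.html", verify=False)
print(response.status_code)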
10. A complete example
import requests
header = {'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 CrKey/1.54.250320 Edg/135.0.0.0'}
for i in range(65151, 65255):
    url = "https://www.phei.com.cn/module/goods/wssd_content.jsp?bookid={}".format(i)
    response = requests.get(url, headers=header)
    html_file_name = "page_{}.html".format(i)
    with open(html_file_name, "w", encoding='utf-8') as f:
        f.write(response.text)
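The loop above fires requests back-to-back and saves whatever comes back; a hedged variant that skips failed pages and pauses between requests (the timeout and delay values are guesses, not tuned numbers):
import requests
import time
header = {'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
for i in range(65151, 65255):
    url = "https://www.phei.com.cn/module/goods/wssd_content.jsp?bookid={}".format(i)
    try:
        response = requests.get(url, headers=header, timeout=5)
    except requests.exceptions.RequestException:
        continue                          # skip pages that fail to load
    if response.status_code != 200:
        continue                          # skip missing book IDs
    with open("page_{}.html".format(i), "w", encoding='utf-8') as f:
        f.write(response.text)
    time.sleep(1)                         # be polite to the server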