当前位置: 首页 > news >正文

爬虫日常实战

1. 爬取污染检测数据(目标页面含内嵌 iframe)

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Scrape city names and detection times from the page's embedded iframe.
# The whole session runs inside try/finally so the Chrome process is always
# shut down, even if an element lookup or click fails part-way through.
driver = webdriver.Chrome()
try:
    # Open the target site and give the page time to load.
    driver.get('https://szzdjc.cnemc.cn:8070/GJZ/Business/Publish/Main.html')
    time.sleep(5)

    # The data is rendered inside the frame with id "MF"; switch into it.
    driver.switch_to.frame(driver.find_element(By.XPATH, '//*[@id="MF"]'))

    # Click the area button so the dynamic filter content is loaded.
    area_button = driver.find_element(By.XPATH, '//*[@id="ddm_Area"]/span')
    area_button.click()
    time.sleep(2)

    # Click the first link of the filter list that was just loaded.
    link = driver.find_element(By.XPATH, '//*[@id="head_filter"]/div[1]/div/ul/li[1]/a')
    link.click()
    # Fixed wait for the data grid to populate.
    time.sleep(45)

    # Scroll to the bottom of the page, then wait for loading to finish.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

    # Collect the city-name and detection-time cells of every grid row.
    cities = driver.find_elements(By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[1]/span')
    detection_times = driver.find_elements(By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[4]')

    # Print the results pairwise.
    for city, detection_time in zip(cities, detection_times):
        print(f"城市名: {city.text}, 检测时间: {detection_time.text}")

    # Switch back to the main document.
    driver.switch_to.default_content()
finally:
    # Always close the browser.
    driver.quit()

2.代理池(89代理),selenium方法

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

# Build a small pool of working proxies from the first two listing pages of
# 89ip.cn (rendered with Selenium), then pick one at random.
working_count = 0  # number of proxies that answered the test request
proxy_arr = []     # "ip:port" strings of the proxies that worked
driver = webdriver.Chrome()
try:
    for num in range(1, 3):
        url = f'https://www.89ip.cn/index_{num}.html'
        driver.get(url)
        time.sleep(3)
        # First and second table columns hold the address and the port.
        proxy_1 = driver.find_elements(By.XPATH, '//tbody/tr/td[1]')
        proxy_2 = driver.find_elements(By.XPATH, '//tbody/tr/td[2]')
        for proxy_11, proxy_22 in zip(proxy_1, proxy_2):
            proxy_temp = f"{proxy_11.text}:{proxy_22.text}"
            proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
            try:
                # The test URL is plain http, so no TLS verification option
                # is needed here.
                response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
            except requests.RequestException:
                # Dead or misbehaving proxy: skip it. Only request errors are
                # swallowed so that Ctrl-C still stops the script.
                continue
            if response.status_code == 200:
                working_count += 1
                proxy_arr.append(proxy_temp)
finally:
    # Close the browser even if scraping failed.
    driver.quit()

proxy = random.choice(proxy_arr) if proxy_arr else "没有可以访问的proxies"
print("一共成功可以访问的代理个数:", working_count)
print("随机选择的代理:", proxy)

3.代理池(89代理),requests方法

import requests
import time
from lxml import etree
import random

# Build a small pool of working proxies from the first two listing pages of
# 89ip.cn (fetched with requests, parsed with lxml), then pick one at random.
working_count = 0  # number of proxies that answered the test request
proxy_arr = []     # "ip:port" strings of the proxies that worked
# NOTE(review): the cookie values look session-specific and will presumably
# need to be refreshed from a browser before running -- confirm.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'cookie': 'Hm_lvt_f9e56acddd5155c92b9b5499ff966848=1730287104,1730429940; HMACCOUNT=F89A075820B0EAF1; Hm_lpvt_f9e56acddd5155c92b9b5499ff966848=1730429996; https_waf_cookie=e5758711-3596-4a395e556275bcab68c58983f0d9a2ba341d; https_ydclearance=d85affb1d0d3d6275303f71f-b44c-4b4e-a087-cd1232c7e338-1730437813',
}
for num in range(1, 3):
    url = f'https://www.89ip.cn/index_{num}.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    time.sleep(3)
    html = etree.HTML(response.text)
    # First and second table columns hold the address and the port.
    proxy_ips = html.xpath('//tbody/tr/td[1]/text()')
    proxy_ports = html.xpath('//tbody/tr/td[2]/text()')
    for ip, port in zip(proxy_ips, proxy_ports):
        proxy_temp = f"{ip.strip()}:{port.strip()}"
        proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
        print(proxies)
        try:
            check = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
        except requests.RequestException:
            # Dead or misbehaving proxy: skip it.
            continue
        if check.status_code == 200:
            working_count += 1
            proxy_arr.append(proxy_temp)

# Report the results.
proxy = random.choice(proxy_arr) if proxy_arr else "没有可以访问的proxies"
print("一共成功可以访问的代理个数:", working_count)
print("随机选择的代理:", proxy)

4.代理池(快代理)

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

# Build a small pool of working proxies from the first two pages of the
# kuaidaili free-proxy list (rendered with Selenium), then pick one at random.
working_count = 0  # number of proxies that answered the test request
proxy_arr = []     # "scheme://ip:port" strings of the proxies that worked
driver = webdriver.Chrome()
try:
    for num in range(1, 3):
        url = f'https://www.kuaidaili.com/free/inha/{num}/'
        driver.get(url)
        time.sleep(3)
        # Column 4 is used as the proxy scheme, columns 1 and 2 as the
        # address and the port.
        heads = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[4]')
        proxy_1 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[1]')
        proxy_2 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[2]')
        for head, proxy_11, proxy_22 in zip(heads, proxy_1, proxy_2):
            head1 = head.text.lower()
            proxy_temp = f"{head1}://{proxy_11.text}:{proxy_22.text}"
            print(proxy_temp)
            # requests selects a proxy by the lowercase scheme of the target
            # URL. Keying the dict by the raw cell text (presumably "HTTP" /
            # "HTTPS" -- confirm against the page) never matches, so the test
            # request would bypass the proxy and always succeed. Register the
            # proxy for both schemes so the check really goes through it.
            proxies = {"http": proxy_temp, "https": proxy_temp}
            try:
                # httpbin's /ip endpoint returns the caller's IP address,
                # which helps confirm the request went through the proxy.
                response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
            except requests.RequestException:
                # Dead or misbehaving proxy: skip it. Only request errors are
                # swallowed so that Ctrl-C still stops the script.
                continue
            if response.status_code == 200:
                working_count += 1
                proxy_arr.append(proxy_temp)
finally:
    # Close the browser even if scraping failed.
    driver.quit()

proxy = random.choice(proxy_arr) if proxy_arr else "没有可以访问的proxies"
print("一共成功可以访问的代理个数:", working_count)
print("随机选择的代理:", proxy)

5. 爬取搜狗微信搜索的多页结果

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Search weixin.sogou.com for a keyword and print the title and account name
# of each result across up to 10 result pages. The session runs inside
# try/finally so the Chrome process is always shut down.
driver = webdriver.Chrome()
try:
    driver.get('https://weixin.sogou.com/pcindex/')
    search_box = driver.find_element(By.XPATH, '//*[@id="query"]')
    search_box.send_keys('爬虫')
    time.sleep(2)
    search_button = driver.find_element(By.XPATH, '//*[@id="searchForm"]/div/span[2]/input')
    search_button.click()
    time.sleep(5)

    # Loop over 10 pages.
    for page in range(10):
        print(f"正在爬取第 {page + 1} 页...")
        # Result elements are addressed by ids suffixed with 0..9.
        for num in range(10):
            try:
                title = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_title_{num}"]').text
                name = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_box_{num}"]/div[2]/div/span[1]').text
                print(f"标题: {title}, 名称: {name}")
            except Exception as e:
                # A missing entry is reported and skipped; the remaining
                # entries on the page are still processed.
                print(f"提取第 {num + 1} 条数据时发生错误: {e}")
        try:
            next_button = driver.find_element(By.XPATH, '//*[@id="sogou_next"]')
            next_button.click()
            time.sleep(5)
        except Exception as e:
            # No "next" button (or the click failed): stop paging.
            print("没有找到下一页,或发生错误:", e)
            break
finally:
    # Always close the browser.
    driver.quit()

http://www.mrgr.cn/news/63651.html

相关文章:

  • c++入门之 命名空间与输入输出
  • hutool糊涂工具通过注解设置excel宽度
  • 【轻松学C:编程小白的大冒险】--- 选择 开发工具(IDE)Dev-c++ 03
  • Photoshop PS批处理操作教程(批量修改图片尺寸、参数等)
  • 走进 JavaScript 世界:掌握核心技能
  • 鸿蒙UI开发——带农历的日期滑动选择弹窗
  • Java项目实战II基于Java+Spring Boot+MySQL的桂林旅游景点导游平台(开发文档+数据库+源码)
  • openai api 文件分析/联网/画图代码示例
  • 2024年10月文章一览
  • 为什么服务器几乎都是Linux操作系统?
  • 怎样提取视频中的音频?分享五款好用软件!
  • 【MX-S4-T2】「yyOI R2」youyou 不喜欢夏天
  • 智能嵌入式机械臂开发攻略
  • Oracle 第18章:分区技术
  • 【AI日记】24.11.01 LangChain、openai api和github copilot
  • flex 布局比较容易犯的错误 出现边界超出的预想的情况
  • Hadoop期末复习(完整版)
  • 使用OCR识别手写文本
  • dc源码铺子应用部署教程
  • CSS3简介(一)
  • 关于SDF系列文章,写在前
  • Raspberry Pi OS 树莓派的新版本
  • [论文阅读]LOGAN: Membership Inference Attacks Against Generative Models
  • ssm+vue657基于spring和vue开发的web新闻流媒体平台
  • Go语言的使用
  • Python实现SPFA算法