
Everyday Web Scraping Practice

1. Scraping pollution monitoring data (page with an inner frame)

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Initialize the Chrome driver
driver = webdriver.Chrome()

# Open the target site
driver.get('https://szzdjc.cnemc.cn:8070/GJZ/Business/Publish/Main.html')

# Wait for the page to load
time.sleep(5)

# Switch into the inner iframe
driver.switch_to.frame(driver.find_element(By.XPATH, '//*[@id="MF"]'))

# Click the button that loads the dynamic content
area_button = driver.find_element(By.XPATH, '//*[@id="ddm_Area"]/span')
area_button.click()

# Wait for the page to update
time.sleep(2)

# Click the dynamically loaded link
link = driver.find_element(By.XPATH, '//*[@id="head_filter"]/div[1]/div/ul/li[1]/a')
link.click()

# Wait for the data to load
time.sleep(45)

# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Wait for loading to finish
time.sleep(2)

# Collect city names and monitoring times
cities = driver.find_elements(By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[1]/span')
detection_times = driver.find_elements(By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[4]')

# Print the results
for city, detection_time in zip(cities, detection_times):
    print(f"City: {city.text}, Monitoring time: {detection_time.text}")

# Switch back to the main document
driver.switch_to.default_content()

# Close the driver
driver.quit()
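
The fixed time.sleep waits above (especially the 45-second one) are brittle: too short and the elements are not there yet, too long and the run wastes time. Below is a minimal sketch of the same flow using Selenium's explicit waits; it assumes the original XPaths are still valid.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://szzdjc.cnemc.cn:8070/GJZ/Business/Publish/Main.html')
wait = WebDriverWait(driver, 60)  # poll for up to 60 s, returning as soon as the condition holds

# Wait for the iframe and switch into it in one step
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, 'MF')))

# Wait until each control is clickable instead of sleeping a fixed interval
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ddm_Area"]/span'))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="head_filter"]/div[1]/div/ul/li[1]/a'))).click()

# Wait until the data grid actually has rows, replacing the 45-second sleep
cities = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[1]/span')))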

2. Proxy pool (89ip free proxies), Selenium approach

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

count = 0  # renamed from `sum` to avoid shadowing the built-in
proxy_arr = []
driver = webdriver.Chrome()

# Scrape the first two listing pages of 89ip.cn
for num in range(1, 3):
    url = f'https://www.89ip.cn/index_{num}.html'
    driver.get(url)
    time.sleep(3)
    proxy_1 = driver.find_elements(By.XPATH, '//tbody/tr/td[1]')  # IP column
    proxy_2 = driver.find_elements(By.XPATH, '//tbody/tr/td[2]')  # port column
    for proxy_11, proxy_22 in zip(proxy_1, proxy_2):
        proxy_temp = f"{proxy_11.text}:{proxy_22.text}"
        proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
        try:
            # httpbin.org/ip returns the caller's IP, confirming the proxy actually relayed the request
            response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5, verify=False)
            if response.status_code == 200:
                count += 1
                proxy_arr.append(proxy_temp)
        except requests.RequestException:
            continue

proxy = random.choice(proxy_arr) if proxy_arr else "no working proxies"
print("Number of working proxies:", count)
print("Randomly selected proxy:", proxy)
driver.quit()

3. Proxy pool (89ip free proxies), requests approach

import requests
import time
from lxml import etree
import random

count = 0  # renamed from `sum` to avoid shadowing the built-in
proxy_arr = []
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'cookie': 'Hm_lvt_f9e56acddd5155c92b9b5499ff966848=1730287104,1730429940; HMACCOUNT=F89A075820B0EAF1; Hm_lpvt_f9e56acddd5155c92b9b5499ff966848=1730429996; https_waf_cookie=e5758711-3596-4a395e556275bcab68c58983f0d9a2ba341d; https_ydclearance=d85affb1d0d3d6275303f71f-b44c-4b4e-a087-cd1232c7e338-1730437813'
}

# Scrape the first two listing pages of 89ip.cn
for num in range(1, 3):
    url = f'https://www.89ip.cn/index_{num}.html'
    response = requests.get(url, headers=headers)
    time.sleep(3)
    html = etree.HTML(response.text)
    proxy_ips = html.xpath('//tbody/tr/td[1]/text()')    # IP column
    proxy_ports = html.xpath('//tbody/tr/td[2]/text()')  # port column
    for ip, port in zip(proxy_ips, proxy_ports):
        proxy_temp = f"{ip.strip()}:{port.strip()}"
        proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
        print(proxies)
        try:
            response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
            if response.status_code == 200:
                count += 1
                proxy_arr.append(proxy_temp)
        except requests.RequestException:
            continue

# Print the results
proxy = random.choice(proxy_arr) if proxy_arr else "no working proxies"
print("Number of working proxies:", count)
print("Randomly selected proxy:", proxy)

4. Proxy pool (Kuaidaili free proxies)

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

count = 0  # renamed from `sum` to avoid shadowing the built-in
proxy_arr = []
driver = webdriver.Chrome()

# Scrape the first two free-proxy listing pages of kuaidaili.com
for num in range(1, 3):
    url = f'https://www.kuaidaili.com/free/inha/{num}/'
    driver.get(url)
    time.sleep(3)
    heads = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[4]')    # scheme column (HTTP/HTTPS)
    proxy_1 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[1]')  # IP column
    proxy_2 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[2]')  # port column
    for head, proxy_11, proxy_22 in zip(heads, proxy_1, proxy_2):
        scheme = head.text.lower()
        proxy_temp = f"{scheme}://{proxy_11.text}:{proxy_22.text}"
        print(proxy_temp)
        try:
            # The proxies-dict key must be the lowercase scheme; the original used
            # head.text directly, which may be uppercase ("HTTP") and never match.
            proxies = {scheme: proxy_temp}
            # httpbin.org is a request-testing service; its /ip endpoint returns
            # the caller's IP, confirming whether the request went through the proxy.
            response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5, verify=False)
            if response.status_code == 200:
                count += 1
                proxy_arr.append(proxy_temp)
        except requests.RequestException:
            continue

proxy = random.choice(proxy_arr) if proxy_arr else "no working proxies"
print("Number of working proxies:", count)
print("Randomly selected proxy:", proxy)
driver.quit()

5. Scraping multiple Sogou result pages

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://weixin.sogou.com/pcindex/')

# Type the keyword into the search box and submit
search_box = driver.find_element(By.XPATH, '//*[@id="query"]')
search_box.send_keys('爬虫')
time.sleep(2)
search_button = driver.find_element(By.XPATH, '//*[@id="searchForm"]/div/span[2]/input')
search_button.click()
time.sleep(5)

# Scrape 10 result pages
for page in range(10):
    print(f"Scraping page {page + 1}...")
    for num in range(10):
        try:
            title = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_title_{num}"]').text
            name = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_box_{num}"]/div[2]/div/span[1]').text
            print(f"Title: {title}, Source: {name}")
        except Exception as e:
            print(f"Error extracting item {num + 1}: {e}")
            continue
    try:
        next_button = driver.find_element(By.XPATH, '//*[@id="sogou_next"]')
        next_button.click()
        time.sleep(5)
    except Exception as e:
        print("No next-page button found, or an error occurred:", e)
        break

driver.quit()
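
The results above are only printed. A sketch of persisting them instead: collect (title, name) tuples into a `results` list inside the loop, then write a CSV at the end; the placeholder row below is mine.

import csv

results = [("sample title", "sample source")]  # placeholder; fill inside the scraping loop
with open('sogou_results.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'source'])  # header row
    writer.writerows(results)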

