Scraping the Douban Movie Top 250 chart's dynamic pages with Scrapy and Selenium
Based on this CSDN blog post: https://blog.csdn.net/qq_43213783/article/details/113063557
Implemented on November 11, 2024.
Create the movie_douban Scrapy project:
scrapy startproject movie_douban
Change into the spiders directory:
cd movie_douban/movie_douban/spiders
Generate the doubanMovieSpider spider:
scrapy genspider doubanMovieSpider movie.douban.com
Edit items.py:
import scrapy


class MovieDoubanItem(scrapy.Item):
    number = scrapy.Field()         # rank on the Top 250 chart
    name = scrapy.Field()           # movie title
    grade = scrapy.Field()          # rating score
    move_describe = scrapy.Field()  # one-line quote
    evaluate = scrapy.Field()       # number of ratings
    introduce = scrapy.Field()      # brief introduction
    image_url = scrapy.Field()      # poster image url
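A MovieDoubanItem behaves like a dict whose keys are restricted to the declared fields; assigning an undeclared key raises KeyError. A quick illustration with made-up sample values:
from movie_douban.items import MovieDoubanItem

item = MovieDoubanItem()
item['name'] = 'The Shawshank Redemption'  # hypothetical sample value
item['number'] = '1'
print(dict(item))  # {'name': 'The Shawshank Redemption', 'number': '1'}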
Edit doubanMovieSpider.py:
import scrapy
from ..items import MovieDoubanItem
import requests, os
from scrapy import Request


class DoubanmoviespiderSpider(scrapy.Spider):
    name = 'doubanMovieSpider'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']
    # list of poster image urls to download later
    urllist = []
    # directory containing this spider file
    dir = os.path.dirname(__file__)
    # directory where images are saved
    dir_path = dir + '/' + 'tupian'

    def parse(self, response):
        # check whether the request succeeded
        if response.status == 404:
            print('failed url')
        # the list of movie entries on this page
        move_lists = response.xpath("//ol[@class='grid_view']/li")
        for move in move_lists:
            item = MovieDoubanItem()
            # for an img tag, use ::attr(xxx) for attributes and ::text for text content
            item['name'] = ''.join(move.css('.pic a img::attr(alt)').extract()).strip()
            item['grade'] = ''.join(move.css('.star span.rating_num::text').extract()).strip()
            item['number'] = ''.join(move.css('.pic em::text').extract()).strip()
            item['move_describe'] = ''.join(move.css('.quote span.inq::text').extract()).strip()
            item['evaluate'] = ''.join(move.css('.star span:nth-child(4)::text').extract()).strip()
            # movie introduction: strip spaces, newlines and tabs, then join into one string
            introduce_content = move.xpath(".//div[@class='bd']/p[1]/text()").getall()  # getall() returns a list with one string per matched text node
            introduce = ''
            for content in introduce_content:
                content_s = ''.join(content.split())  # remove spaces, newlines and tabs
                introduce = introduce + content_s + '  '  # paragraphs separated by two spaces
            item['introduce'] = introduce
            # poster image url; strip() removes surrounding whitespace
            image_url = ''.join(move.css('.pic a img::attr(src)').extract()).strip()
            item['image_url'] = image_url
            self.urllist.append(image_url)
            yield item
        # follow the next page
        next_text = response.xpath("//div[@class='paginator']/span[@class='next']")
        next_page = ''.join(next_text.css('a::attr(href)').extract()).strip()
        if next_page:
            url = 'https://movie.douban.com/top250' + next_page
            yield Request(url=url, callback=self.parse)
        # else:  # after all entries are crawled, download the images; uncomment to enable
        #     for url in self.urllist:
        #         self.save_image(url)

    def save_image(self, url):
        # fetch the image with requests
        response = requests.get(url)
        # file name of the image
        img_name = url.split('/')[-1]
        # path where the image is saved (os.path.join is portable across OSes)
        img_path = os.path.join(self.dir_path, img_name)
        # save the image locally
        try:
            if not os.path.exists(self.dir_path):
                os.makedirs(self.dir_path)
            if not os.path.exists(img_path):
                with open(img_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('file already exists')
        except Exception as e:
            print('error while saving image:', e)
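Before a full crawl, the selectors can be sanity-checked interactively with scrapy shell (a quick test against the static page, assuming Douban serves it to a browser-like User-Agent; the Selenium-rendered page may differ):
scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64)" https://movie.douban.com/top250
>>> movies = response.xpath("//ol[@class='grid_view']/li")
>>> movies[0].css('.pic a img::attr(alt)').extract()        # title of the first entry
>>> movies[0].css('.star span.rating_num::text').extract()  # its rating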
Edit middlewares.py, using Selenium to fetch the rendered response for Douban's dynamic pages:
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import time


# the new Selenium middleware
class seleniumMiddleware(object):
    def __init__(self):
        self.timeout = 20  # timeout in seconds
        self.driver = webdriver.Chrome()  # use the Chrome browser
        self.driver.set_page_load_timeout(self.timeout)  # page-load timeout
        self.wait = WebDriverWait(self.driver, self.timeout)  # explicit-wait helper

    def process_request(self, request, spider):
        self.driver.get(request.url)  # open the requested url in the browser
        time.sleep(1)  # give dynamic content one second to render
        # return the rendered page to Scrapy as the response
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            encoding='utf-8', request=request)
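One caveat: the middleware above never quits the driver, so a Chrome window stays open after the crawl finishes. A minimal cleanup sketch (an addition of this write-up, not from the referenced blog) that hooks Scrapy's spider_closed signal inside seleniumMiddleware:
from scrapy import signals  # add to the imports at the top of middlewares.py

class seleniumMiddleware(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # call spider_closed() once the spider finishes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()  # shut down the Chrome instance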
Edit pipelines.py to save the scraped data to MongoDB:
from itemadapter import ItemAdapter
from pymongo import MongoClient


class DoubanMoviePipeline(object):
    def open_spider(self, spider):
        # MongoDB connection settings
        self.MONGO_URI = 'mongodb://localhost:27017/'
        self.DB_NAME = 'movie'  # database name
        self.COLLECTION_NAME = 'doubanMovie'  # collection name
        self.client = MongoClient(self.MONGO_URI)
        self.db = self.client[self.DB_NAME]
        self.collection = self.db[self.COLLECTION_NAME]
        # clear the collection if it already contains data
        self.collection.delete_many({})
        print('crawl started')

    def process_item(self, item, spider):
        # convert the item into a plain dict
        item_dict = {
            'name': item['name'],
            'grade': item['grade'],
            'number': item['number'],
            'move_describe': item['move_describe'],
            'evaluate': item['evaluate'],
            'introduce': item['introduce'],
            'image_url': item['image_url'],
        }
        # insert the record
        print('about to insert record')
        print(item_dict)
        self.collection.insert_one(item_dict)
        print('record inserted')
        # self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        print('crawl finished; printing all documents in the collection')
        cursor = self.collection.find()
        for document in cursor:
            print(document)
        self.client.close()
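After a run, the inserted data can be double-checked outside Scrapy. A minimal sketch with pymongo, assuming MongoDB is running on localhost:27017 with the database and collection names configured above:
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
collection = client['movie']['doubanMovie']
print(collection.count_documents({}))        # should print 250 after a full crawl
print(collection.find_one({'number': '1'}))  # the top-ranked movie (number is stored as a string)
client.close()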
Edit settings.py:
Uncomment the middleware and pipelines settings and point them at the concrete class names, and add a User-Agent field to the default request headers. Note that seleniumMiddleware implements process_request, which makes it a downloader middleware, so it must be registered under DOWNLOADER_MIDDLEWARES rather than SPIDER_MIDDLEWARES.
DOWNLOADER_MIDDLEWARES = {
    'movie_douban.middlewares.seleniumMiddleware': 543,
}
ITEM_PIPELINES = {
'movie_douban.pipelines.DoubanMoviePipeline': 300,
}
DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}
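If the log shows requests being filtered because of robots.txt, ROBOTSTXT_OBEY may also need changing; new Scrapy projects default it to True, and whether that blocks this crawl depends on Douban's current robots.txt, so treat this as something to verify rather than a certainty:
ROBOTSTXT_OBEY = False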
Run the project:
Create a run.py file under the movie_douban/movie_douban folder:
from scrapy import cmdline
cmdline.execute("scrapy crawl doubanMovieSpider -s LOG_ENABLED=true".split())
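run.py is just a convenience wrapper; equivalently, run the following from the project root (the directory containing scrapy.cfg):
scrapy crawl doubanMovieSpider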
Run run.py; the result is as follows:
All 250 movie records are scraped: