scrapy案例——链家租房数据的爬取
案例需求:
1.使用scrapy爬虫技术爬取链家租房网站(成都租房信息_成都出租房源|房屋出租价格【成都贝壳租房】 )的数据(包括标题、价格和链接)
2.利用XPath进行数据解析
3.保存为本地json文件
分析:
请求地址:
成都租房信息_成都出租房源|房屋出租价格【成都贝壳租房】
伪装浏览器
# Browser identification string sent with requests so the crawler presents
# itself as a desktop Chrome browser.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/116.0.0.0 Safari/537.36"
)
关闭君子协议(robots.txt 协议)——在 settings.py 中注释这行即可
# ROBOTSTXT_OBEY = True
XPath解析数据
#2.解析数据
name = response.xpath('//div[@class="content__list--item"]//a[@class="content__list--item--aside"]/@title').extract()
price = response.xpath('//span[@class="content__list--item-price"]/em/text()').extract()
link = response.xpath('//div[@class="content__list--item"]//a[@class="content__list--item--aside"]/@href').extract()
# print(name)
# print(price)
# print(link)
for names, prices, links in zip(name, price, link):
    print(names)
    print(prices)
    print(links)
    print('=========================')
将数据打包并发送给item
#3.将数据打包 实例化类
item = MyspiderItem()
# print(item)
item['name'] = names
item['price'] = prices
item['link'] = links
#3.2返回给引擎
yield item
items
class MyspiderItem(scrapy.Item):
    """Item holding the three fields collected for each rental listing."""

    # Define the fields for the item here.
    name = scrapy.Field()   # listing title
    price = scrapy.Field()  # listing price
    link = scrapy.Field()   # link to the listing
保存数据,并写在pipelines中
class MyspiderPipeline:
    """Write every item it receives to ``lianjia.json``, one JSON object per line."""

    def __init__(self):
        # Explicit UTF-8 so the Chinese text is stored as-is; mode "w"
        # truncates any output left over from a previous run.
        self.file = open('lianjia.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and hand it back unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (not used here).

        Returns:
            The same ``item`` object that was passed in.
        """
        print('管道文件的item', item)
        dict_data = dict(item)
        print(type(dict_data))
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # turning them into \uXXXX escape sequences.
        json_data = json.dumps(dict_data, ensure_ascii=False) + '\n'
        self.file.write(json_data)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        This is the pipeline hook Scrapy invokes on spider shutdown, so the
        file is flushed and closed at a well-defined point. Calling it more
        than once is harmless.
        """
        file = getattr(self, 'file', None)
        if file is not None and not file.closed:
            file.close()

    def __del__(self):
        # Fallback only: covers the case where close_spider was never called.
        # The guard in close_spider also keeps this safe when __init__ failed
        # before self.file was assigned.
        self.close_spider(None)
这时运行结果item为空字典——需要在settings中开启管道才可写入数据
# Enable the JSON-writing pipeline; 300 is its order value among the
# enabled pipelines.
ITEM_PIPELINES = {
    "myspider.pipelines.MyspiderPipeline": 300,
}
创建项目:
代码示例:
lianjia.py
import scrapy

from myspider.items import MyspiderItem


class LianjiaSpider(scrapy.Spider):
    """Spider that collects the title, price and link of each rental listing."""

    name = "lianjia"  # spider name, as used by `scrapy crawl lianjia`
    allowed_domains = ["lianjia.com"]  # domain the crawl is restricted to
    # NOTE(review): the article text refers to Chengdu, while the "cs"
    # subdomain looks like a different city -- confirm the intended start URL.
    start_urls = ["https://cs.lianjia.com/zufang/"]  # first URL to request

    def parse(self, response):
        """Parse a listing page and yield one ``MyspiderItem`` per listing.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            MyspiderItem: An item with ``name`` (title), ``price`` and
            ``link`` (resolved against the page URL) for each listing that
            has all three values.
        """
        # 1. Keep a copy of the raw HTML so the page structure can be
        #    inspected offline. UTF-8 is set explicitly so that writing the
        #    Chinese text does not depend on the platform default encoding.
        with open('lj.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

        # 2. Parse the data one listing at a time. Extracting titles, prices
        #    and links as three separate page-wide lists and zipping them
        #    would shift every later row as soon as one listing lacks a field.
        #    Assumes the price <span> is nested inside the listing <div> --
        #    TODO confirm against the saved lj.html.
        for listing in response.xpath('//div[@class="content__list--item"]'):
            anchor = listing.xpath('.//a[@class="content__list--item--aside"]')
            names = anchor.xpath('./@title').get()
            links = anchor.xpath('./@href').get()
            prices = listing.xpath(
                './/span[@class="content__list--item-price"]/em/text()'
            ).get()
            if names is None or prices is None or links is None:
                # Incomplete entry: skip it rather than emit a partial item.
                continue

            # Turn a relative href into an absolute URL; an href that is
            # already absolute is returned unchanged.
            links = response.urljoin(links)

            print(names)
            print(prices)
            print(links)
            print('=========================')

            # 3. Pack the data into an item instance.
            item = MyspiderItem()
            item['name'] = names
            item['price'] = prices
            item['link'] = links
            # 3.2 Hand the item back to the engine.
            yield item
        # 4. Saving the data is done in pipelines.py.
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyspiderItem(scrapy.Item):
    """Item holding the three fields collected for each rental listing."""

    # Define the fields for the item here.
    name = scrapy.Field()   # listing title
    price = scrapy.Field()  # listing price
    link = scrapy.Field()   # link to the listing
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
#保存数据-json数据
class MyspiderPipeline:
    """Write every item it receives to ``lianjia.json``, one JSON object per line."""

    def __init__(self):
        # Explicit UTF-8 so the Chinese text is stored as-is; mode "w"
        # truncates any output left over from a previous run.
        self.file = open('lianjia.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and hand it back unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (not used here).

        Returns:
            The same ``item`` object that was passed in.
        """
        print('管道文件的item', item)
        dict_data = dict(item)
        print(type(dict_data))
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # turning them into \uXXXX escape sequences.
        json_data = json.dumps(dict_data, ensure_ascii=False) + '\n'
        self.file.write(json_data)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        This is the pipeline hook Scrapy invokes on spider shutdown, so the
        file is flushed and closed at a well-defined point. Calling it more
        than once is harmless.
        """
        file = getattr(self, 'file', None)
        if file is not None and not file.closed:
            file.close()

    def __del__(self):
        # Fallback only: covers the case where close_spider was never called.
        # The guard in close_spider also keeps this safe when __init__ failed
        # before self.file was assigned.
        self.close_spider(None)
运行项目文件——start.py
from scrapy.cmdline import execute

# Launch the "lianjia" spider exactly as the command line
# `scrapy crawl lianjia --nolog` would; --nolog turns off Scrapy's log output.
crawl_argv = ['scrapy', 'crawl', 'lianjia', '--nolog']
execute(crawl_argv)
运行结果: