|
import urllib.parse

import requests
import scrapy
class TbSpider(scrapy.Spider):
    """Crawl a Baidu Tieba forum via the mobile site.

    Walks every page of the thread list, opens each thread, follows the
    thread's own pagination, and finally yields one item dict per thread
    with keys: ``title``, ``href``, ``img_list``.
    """

    name = 'tb'
    allowed_domains = ['baidu.com']
    start_urls = [
        'https://tieba.baidu.com/mo/q---3237548E300DB3D76551F88E33D2ABB1%3AFG%3D1--1-3-0--2--wapp_1548077552162_707/m?kw=%E9%92%A6%E5%B7%9E%E5%AD%A6%E9%99%A2&lp=5011&lm=&pn=0'
    ]

    def parse(self, response):
        """Parse a list page: one detail request per thread, then follow pagination.

        :param response: list-page response; threads sit in ``div`` elements
            whose class contains ``"i"``.
        """
        i_list = response.xpath('//div[contains(@class,"i")]')
        for i in i_list:
            item = {
                'title': i.xpath('./a/text()').extract_first(),
                # hrefs on the mobile site are relative — resolve against the page URL
                'href': urllib.parse.urljoin(
                    response.url, i.xpath('./a/@href').extract_first()
                ),
                'img_list': [],
            }
            yield scrapy.Request(
                url=item['href'],
                callback=self.parse_detail,
                meta={'item': item},
            )
        # Next page of the thread list.
        next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
        if next_url is not None:
            next_url = urllib.parse.urljoin(response.url, next_url)
            # FIX: the original constructed this Request but never yielded it,
            # so the spider stopped after the first list page.
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Parse a thread detail page, following its pagination until the end.

        The partially filled item travels through ``response.meta`` across
        the thread's pages; on the last page it is yielded to the pipelines.
        """
        item = response.meta['item']

        # Next page within the thread.
        next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
        if next_url is not None:
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_detail,
                meta={'item': item},
            )
        else:
            # FIX: the original only print()ed the finished item; yielding it
            # hands it to Scrapy's item pipelines / feed exporters.
            yield item
|
|