scrapy翻页问题:实现不了翻页,第一页爬完就停止了,检...
import urllib.parse

import requests
import scrapy
class TbSpider(scrapy.Spider):
    """Crawl a Baidu Tieba list page, its threads, and their pagination.

    ``parse`` handles list pages: it schedules one detail request per
    linked entry and then follows the list's "next page" link.
    ``parse_detail`` handles thread pages: it keeps following the
    thread's own "next page" link, carrying the partially built item
    along in ``meta``, and reports the item once no further page exists.
    """

    name = 'tb'
    allowed_domains = ['baidu.com']
    start_urls = [
        'https://tieba.baidu.com/mo/q---3237548E300DB3D76551F88E33D2ABB1%3AFG%3D1--1-3-0--2--wapp_1548077552162_707/m?kw=%E9%92%A6%E5%B7%9E%E5%AD%A6%E9%99%A2&lp=5011&lm=&pn=0'
    ]

    # XPath selecting the href of the "next page" link. A bare
    # '//a/@href' would return the first link of the page, which is not
    # necessarily the pagination link.
    # NOTE(review): assumes the link is labelled "下一页" on both list
    # and detail pages -- confirm against the live markup, or override.
    next_page_xpath = '//a[text()="下一页"]/@href'

    def parse(self, response):
        """Handle a list page.

        Yields one ``scrapy.Request`` per entry that has a link (handled
        by ``parse_detail``), followed by a request for the next list
        page when one is found.
        """
        for node in response.xpath('//div'):
            href = node.xpath('./a/@href').extract_first()
            if href is None:
                # No direct <a> child: urljoin(base, None) would return
                # the list page URL itself, so skip instead of queueing
                # the list page as if it were a thread.
                continue
            item = {
                'title': node.xpath('./a/text()').extract_first(),
                'href': urllib.parse.urljoin(response.url, href),
                'img_list': [],
            }
            yield scrapy.Request(
                url=item['href'],
                callback=self.parse_detail,
                meta={'item': item},
            )

        # Next list page. The request has to be yielded: a Request that
        # is only constructed is never scheduled, so the crawl would end
        # after the first page.
        next_url = response.xpath(self.next_page_xpath).extract_first()
        if next_url is not None:
            yield scrapy.Request(
                url=urllib.parse.urljoin(response.url, next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Handle one page of a thread.

        Follows the thread's next page with the same item attached in
        ``meta``; when there is no next page, prints the item.
        """
        item = response.meta['item']
        # Content / image extraction is currently disabled:
        # item['content_list'].extend(response.xpath('//div[@class="i"]//text()').extract())
        # item['img_list'].extend(response.xpath('//div[@class="i"]/a/@href').extract())

        # Next page of the thread.
        next_url = response.xpath(self.next_page_xpath).extract_first()
        if next_url is not None:
            yield scrapy.Request(
                url=urllib.parse.urljoin(response.url, next_url),
                callback=self.parse_detail,
                meta={'item': item},
            )
        else:
            # Last page reached; the item is only printed for now.
            print(item)
            # yield item
页:
[1]