https://github.com/lunarwhite/hello-scrapy-mongo
Check the installed Python version (python3 --version)
create virtual env
python3 -m venv .venv
source .venv/bin/activate
# deactivate
pkg: scrapy
pip install scrapy
pkg: pymongo
python3 -m pip install 'pymongo[srv]'
Export pinned dependencies
pip freeze > requirements.txt
start project
scrapy startproject hello
project structure
.
├── scrapy.cfg # config file
└── hello
├── __init__.py
├── items.py # data structure
├── middlewares.py
├── pipelines.py
├── settings.py
    └── spiders # directory to put your spiders in
├── __init__.py
└── hello_spider.py
example
# items.py
import scrapy
class HelloItem(scrapy.Item):
    """Container for one scraped question: its title and its link.

    Fields are declared with scrapy.Field(); values are assigned
    dict-style by the spider (item['title'] = ...).
    """
    title = scrapy.Field()  # question title text
    url = scrapy.Field()    # hyperlink to the question
    # NOTE: the redundant trailing `pass` was removed — the class body
    # already contains statements, so `pass` was dead code.
example
# hello_spider.py
import scrapy
from hello.items import HelloItem
class HelloSpider(scrapy.Spider):
    """Spider that collects question titles and links from Stack Overflow.

    Crawls the newest-questions listing page and yields one HelloItem
    per question summary found on the page.
    """

    name = "hello"
    allowed_domains = ["stackoverflow.com"]
    # FIX: removed the literal angle brackets that wrapped the URL —
    # "<http://...>" is a markdown autolink artifact, not a valid URL,
    # and Scrapy would fail to request it.
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        """Yield a HelloItem for each question summary on the page.

        :param response: the downloaded page (scrapy Response).
        """
        questions = response.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = HelloItem()
            # extract() returns a list of matches; [0] assumes the
            # anchor node exists in every summary block.
            item['title'] = question.xpath('a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath('a[@class="question-hyperlink"]/@href').extract()[0]
            yield item