Scrapy is a powerful framework for crawling web pages and extracting the information you need.
Installation
pip install scrapy
Start the project
To start a project, go into the folder where you would like the project to live and type:
scrapy startproject tutorial
This will generate the initial project files.
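On a recent Scrapy version the generated layout looks roughly like this (the exact files can vary between releases):
tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # directory for your spiders
            __init__.py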
Create crawler
Here is a sample crawler that extracts links:
import scrapy


class Item(scrapy.Item):
    link = scrapy.Field()


class BlogSpider(scrapy.Spider):
    name = 'getabc123'
    allowed_domains = ['kulturnicenterq.org']  # domains only, no path
    start_urls = ['https://www.kulturnicenterq.org/lgbtqslovar/']

    def parse(self, response):
        # Get links from the a-z navigation in the header
        for li in response.xpath('//*[@id="header"]/nav/ul/li'):
            link = li.css("li a::attr(href)").extract_first()
            yield scrapy.Request(
                url=link,
                callback=self.parse_subpage,
                dont_filter=True
            )

    def parse_subpage(self, response):
        # Add the first page to the list
        yield Item(link=response.url)
        # Add paginated subpages to the list
        for nav in response.xpath('//a[contains(@class, "page")]'):
            link = nav.css("a::attr(href)").extract_first()
            yield Item(link=link)
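Assuming the spider is saved in the project's spiders/ folder, a quick way to try it is to run it by its name and write the collected links to a feed file (the output file name here is arbitrary):
scrapy crawl getabc123 -o links.json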
And here is a more complex one that works through a couple of page levels before it gets to the data we need:
import re

import scrapy


class Item(scrapy.Item):
    term = scrapy.Field()
    origin = scrapy.Field()
    synonyms = scrapy.Field()
    meaning = scrapy.Field()
    examples = scrapy.Field()
    importance = scrapy.Field()
    new = scrapy.Field()


class Pages(scrapy.Spider):
    name = 'allinone'
    allowed_domains = ['kulturnicenterq.org']  # domains only, no path
    start_urls = ['https://www.kulturnicenterq.org/lgbtqslovar/']

    def parse(self, response):
        """Get links from the a-z navigation in the header."""
        for li in response.xpath('//*[@id="header"]/nav/ul/li'):
            link = li.css("li a::attr(href)").extract_first()
            # Check a-z pages for any subpages
            yield scrapy.Request(
                url=link,
                callback=self.parse_a_z,
                dont_filter=True
            )

    def parse_a_z(self, response):
        """Parse a-z pages and collect their subpages, e.g. /page/2/ ..."""
        links = []
        # Add page 1
        links.append(response.url)
        # Check the pagination for any additional pages 2, 3, 4 ...
        for pagination in response.xpath('//a[contains(@class, "page")]'):
            link = pagination.css("a::attr(href)").extract_first()
            # Add the subpage to the list
            links.append(link)
        # Go through all subpage links to get the links of the terms
        for link in links:
            yield scrapy.Request(
                url=link,
                callback=self.parse_subpages_for_terms,
                dont_filter=True
            )

    def parse_subpages_for_terms(self, response):
        """Get the term page links."""
        for item in response.xpath('//*[@id="middle"]/div/div[1]/h1/a'):
            link = item.css("a::attr(href)").extract_first()
            # Visit every term page to get the data
            yield scrapy.Request(
                url=link,
                callback=self.parse_term,
                dont_filter=True
            )

    def parse_term(self, response):
        """Parse a term page to get all the info."""
        TITLE_XPATH = '//*[@id="middle"]/div/div[1]/h1/a'
        NEW_XPATH = '//h1[contains(@class, "novo")]'
        ORIGIN_XPATH = '//*[@id="content"]/p/text()'
        SYNONYMS_XPATH = '//*[@id="related-posts-MRP_all"]/ul/li/a/text()'
        DESCRIPTIONS_XPATH = '//*[@id="content"]/ol/li/p/text()'
        DESCRIPTION_XPATH = '//*[@id="content"]/ol/li[{}]/p/text()'
        EXAMPLES_XPATH = '//*[@id="content"]/ul/li/p/text()'
        EXAMPLE_XPATH = '//*[@id="content"]/ul/li[{}]/p/text()'

        for item in response.xpath('//*[@id="middle"]/div/div[1]'):
            # Get the term
            a = item.xpath(TITLE_XPATH)
            title = a.css("a::text").extract_first()
            term = title.strip()
            # Get the "new word" status
            new = item.xpath(NEW_XPATH).extract()
            new = True if new else False
            # Get the origin, e.g. ang., gr., kratica (abbreviation)
            origin = item.xpath(ORIGIN_XPATH).extract()
            if origin:
                origin = origin[0]
            # Get synonyms
            synonyms = item.xpath(SYNONYMS_XPATH).extract()
            # Get descriptions
            descriptions = item.xpath(DESCRIPTIONS_XPATH).extract()
            descriptions_count = len(descriptions)
            # Parse descriptions and examples, last one first
            while descriptions_count > 0:
                # Get the description
                description = item.xpath(
                    DESCRIPTION_XPATH.format(descriptions_count)).extract()
                if description:
                    meaning = description[0]
                else:
                    meaning = []
                # Get the example
                examples = item.xpath(
                    EXAMPLE_XPATH.format(descriptions_count)).extract()
                # Remove the default copy if there are no examples
                # ('Primerov \u0161e ni.' means 'There are no examples yet.')
                if examples and examples[0] == 'Primerov \u0161e ni.':
                    examples[0] = ''
                # Remove "1| ", "2| ", "3| " at the beginning of examples
                if examples:
                    found = re.match(r'^\d+\|\s', examples[0])
                    if found:
                        examples[0] = examples[0][len(found.group()):]
                descriptions_count -= 1
                # Save the data
                yield Item(term=term, new=new, origin=origin, synonyms=synonyms,
                           meaning=meaning, examples=examples,
                           importance=descriptions_count + 1)
List all available crawlers
scrapy list
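With the two example spiders above in the project, this prints their name attributes, something like:
allinone
getabc123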
Run crawler
Run a spider by its name attribute, e.g. the allinone spider defined above:
scrapy crawl allinone
# output csv
scrapy crawl allinone -o dictionary.csv
# output json
scrapy crawl allinone -o dictionary.json
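The scraped text here is Slovenian, so if the JSON feed comes out \u-escaped, a setting like this in settings.py keeps the output readable (a suggestion, not part of the original setup):
FEED_EXPORT_ENCODING = 'utf-8'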
Scrapy shell
Scrapy shell is a great tool for discovery: you can experiment with selectors and see what they return before you use them in your spider.
scrapy shell "https://www.cnn.com"
response.css('title')
response.css('title').extract()
response.css('title::text').extract() # list
response.css('title::text')[0].extract() # string
response.css('title::text').extract_first() # string
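Newer Scrapy versions also offer .get() and .getall(), the now-recommended equivalents of .extract_first() and .extract():
response.css('title::text').get()     # same as extract_first()
response.css('title::text').getall()  # same as extract()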
# CSS
response.css('title::text').re(r'Names.*')
['Names to Remember']
response.css('title::text').re(r'N\w+')
['Names']
response.css('title::text').re(r'(\w+) to (\w+)')
['Names', 'Remember']
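# `quote` below is assumed to be a single selector taken from a page such as
# https://quotes.toscrape.com, e.g. quote = response.css("div.quote")[0]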
quote.css("span.text::text").extract_first()
quote.css("div.tags a.tag::text").extract()
# XPATH
response.xpath('//title')
[<Selector xpath='//title' data='<title>Names to Remember</title>'>]
response.xpath('//title/text()').extract_first()
'Names to Remember'