lib docs

Install

pip install scrapy

Creating a project

# [project_name] 자리에 원하는 프로젝트 이름을 적으면 된다
scrapy startproject [project_name]

간단한 스크레이퍼 작성

import scrapy

class ArticleSpider(scrapy.Spider):
    """Minimal spider: fetch a fixed list of pages and print each URL and title."""

    # Spider name used on the command line, e.g. ``scrapy crawl article``.
    name = 'article'

    def start_requests(self):
        """Build the initial requests for the crawl.

        Returns:
            A list with one ``scrapy.Request`` per seed URL; every response
            is handed to :meth:`parse`.
        """
        # Placeholders only: replace with real absolute URLs (including the
        # scheme) before running the spider.
        urls = [
            'url_1',
            'url_2',
            'url_3',
        ]

        # The request class is ``scrapy.Request`` (singular).
        return [scrapy.Request(url=url, callback=self.parse) for url in urls]

    def parse(self, response):
        """Print the URL and the first ``<h1>`` text of a downloaded page.

        Args:
            response: The response object Scrapy passes to the callback.
        """
        url = response.url
        # extract_first() yields None when the page has no <h1> text.
        title = response.css('h1::text').extract_first()
        print('URL is : {}'.format(url))
        print('title is : {}'.format(title))

실행

# 프로젝트 없이 스파이더 파일 하나만 단독으로 실행할 때
scrapy runspider article.py
# 프로젝트 디렉터리 안에서 스파이더의 name 으로 실행할 때
scrapy crawl article

LinkExtractor, CrawlSpider, Rule

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia from one seed page and print details of each page visited.

    Links are classified by two rules: article links (``/wiki/`` paths with
    no colon after the prefix) are followed and parsed with
    ``is_article=True``; every other link is parsed once with
    ``is_article=False`` and not followed further.
    """

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # LinkExtractor matches ``allow`` against the absolute URL, so the
        # pattern must not be anchored to the start with ``^``: a leading
        # ``^/wiki/`` can never match ``https://...``. The negative lookahead
        # rejects any colon after ``/wiki/`` (e.g. ``File:``, ``Talk:``).
        Rule(
            LinkExtractor(allow=r'(/wiki/)((?!:).)*$'),
            callback='parse_items',
            follow=True,
            cb_kwargs={'is_article': True},
        ),
        # Catch-all for links the first rule did not claim.
        Rule(
            LinkExtractor(allow='.*'),
            callback='parse_items',
            cb_kwargs={'is_article': False},
        ),
    ]

    def parse_items(self, response, is_article):
        """Print what was scraped from one page.

        Args:
            response: The response object Scrapy passes to the callback.
            is_article: Flag supplied through the rule's ``cb_kwargs``;
                True when the link matched the article rule.
        """
        print(response.url)
        # extract_first() yields None when the selector matches nothing.
        title = response.css('h1::text').extract_first()

        if not is_article:
            print('This is not an article: {}'.format(title))
            return

        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        last_updated = response.css('li#footer-info-lastmod::text').extract_first()
        # The footer element may be absent; only strip the prefix when the
        # text was actually found.
        if last_updated is not None:
            last_updated = last_updated.replace('This page was last edited on ', '')

        print('Title is: {} '.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(last_updated))