1. Python version: 3.6.1
2. Python IDE: JetBrains PyCharm
3. Install virtualenvwrapper-win
pip3 install virtualenvwrapper-win
4. Create the virtual environment and install the dependencies
mkvirtualenv spider_article
pip install C:\Users\CR\Downloads\Twisted-17.5.0-cp36-cp36m-win_amd64.whl
pip install pypiwin32
pip install -i https://pypi.douban.com/simple/ scrapy
pip install mysqlclient
pip install pillow

(Twisted is installed first from a locally downloaded wheel because pip would otherwise try to build it from source, which fails on Windows without a C++ toolchain; Scrapy is then installed from the Douban mirror for speed.)
5. Create the project (run these in the directory where the project should live):
1. Open cmd
2. workon spider_article
3. scrapy startproject ArticleSpider
4. cd ArticleSpider
5. scrapy genspider jobbole blog.jobbole.com (this generates the skeleton shown below)
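For reference, scrapy genspider produces a spider skeleton roughly like the following (the exact template varies slightly between Scrapy versions); section 7 fills it in:

import scrapy


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/']

    def parse(self, response):
        pass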
6. Create a debug script under the ArticleSpider folder
from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
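Save this script (commonly as main.py) in the project root, next to scrapy.cfg. It starts the crawler in-process, so running or debugging it from PyCharm hits breakpoints set inside the spider code.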
7. Main code
1. jobbole.py
import scrapy
import re
import datetime
from scrapy.http import Request
from urllib import parse  # in Python 2 this was: import urlparse
from scrapy.loader import ItemLoader

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        '''
        1. Extract the article URLs from the list page and hand them to Scrapy to download and parse.
        2. Extract the next-page URL and hand it to Scrapy to download; the response is fed back into parse.
        :param response:
        :return:
        '''
        # extract() returns a list as soon as it runs
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # parse.urljoin: if post_url has no domain, the domain is taken from response.url;
            # if post_url already contains a domain, response.url has no effect
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # extract the next page and hand it to Scrapy to download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        '''
        Extract the individual fields of one article.
        :param response:
        :return:
        '''
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()
        yield article_item
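To see what the ItemLoader above is doing, here is a minimal self-contained sketch (the HTML body and URL are made up for illustration; only the title field is loaded): add_css collects matching values into a list, and the TakeFirst output processor collapses that list when load_item() runs.

from scrapy.http import HtmlResponse
from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader

# fake response with the same structure the title selector expects
body = b"<html><div class='entry-header'><h1>Demo title</h1></div></html>"
response = HtmlResponse(url="http://blog.jobbole.com/1/", body=body, encoding="utf-8")

loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
loader.add_css("title", ".entry-header h1::text")  # collected internally as ["Demo title"]
loader.add_value("url", response.url)
item = loader.load_item()
print(item["title"])  # "Demo title" -- TakeFirst() returns the first collected value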
2. items.py
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    # parse "2017/05/18" style dates; fall back to today on bad input
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    # pull the first integer out of strings like " 8 收藏"
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # drop the "N 评论" (comment count) entry that gets extracted along with the tags
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    # title = scrapy.Field(
    #     input_processor=MapCompose(lambda x: x + '-jobbole')
    # )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
        # output_processor=TakeFirst(),  # take only the first value (already the default above)
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)  # keep the list form: ImagesPipeline expects a list of URLs
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
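A quick sanity check of the processors above, run from a Python shell after importing them from ArticleSpider.items (the sample strings mimic what the site returns):

print(get_nums(" 8 收藏"))            # -> 8
print(get_nums("收藏"))               # -> 0 (no digits found)
print(date_convert("2017/05/18"))     # -> datetime.date(2017, 5, 18)
print(date_convert("not a date"))     # -> today's date (the fallback branch)
print(remove_comment_tags("2 评论"))  # -> "" (comment counts are dropped from the tags)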
3. pipelines.py
import codecs
import json

import MySQLdb
import MySQLdb.cursors
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi  # adbapi turns blocking MySQLdb calls into asynchronous ones


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    # custom JSON file export
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        self.file.close()


class MysqlPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect("localhost", "root", "", "article_spider", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = '''
            INSERT INTO article (title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        '''
        self.cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))
        self.conn.commit()
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        '''
        Run the MySQL insert asynchronously via Twisted.
        :param item:
        :param spider:
        :return:
        '''
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # perform the actual insert
        insert_sql = '''
            INSERT INTO article (title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        '''
        cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))


class JsonExporterPipeline(object):
    # export a JSON file using Scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples, one per downloaded image
        if "front_image_url" in item:
            image_file_path = ""
            for ok, value in results:
                image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item
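The two MySQL pipelines assume an article table already exists in the article_spider database. A one-off sketch to create it (the column names come from the INSERT statements above; the types and sizes are my assumptions, adjust as needed):

import MySQLdb

conn = MySQLdb.connect("localhost", "root", "", "article_spider", charset="utf8")
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS article (
        title VARCHAR(200) NOT NULL,
        url VARCHAR(300) NOT NULL,
        create_date DATE,
        fav_nums INT DEFAULT 0
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
conn.close()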
4. Shared helper functions (stored in ArticleSpider/utils/common.py)
import hashlib

def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

if __name__ == '__main__':
    print(get_md5("http://jobbole.com"))
5. Configure settings.py
import os

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # image download
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 3,
}
IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")  # where downloaded images are stored

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
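One more setting worth checking: the Scrapy project template ships with ROBOTSTXT_OBEY = True, which filters out requests disallowed by the target site's robots.txt; walkthroughs of this crawl typically disable it:

ROBOTSTXT_OBEY = False  # otherwise Scrapy may silently drop the article requests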