I. Scraping the jobbole website
1. Debugging Scrapy
Write a main.py in the project root that invokes the Scrapy command line, so the spider can be started (and debugged with breakpoints) from the IDE.
from scrapy.cmdline import execute

import sys
import os

# Put the project root on sys.path so the project package can be imported
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Equivalent to running "scrapy crawl jobbole" on the command line
execute(["scrapy", "crawl", "jobbole"])
Also set the robots option in settings.py to False, so Scrapy does not skip URLs disallowed by robots.txt:
ROBOTSTXT_OBEY = False
2. Passing the thumbnail from the list page along with the Request
meta={"front_image_url": image_url}, callback=...
Retrieve the value with get(), which does not raise an exception when the key is missing; the last argument is the default value (an empty string here):
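A minimal sketch of both ends of this hand-off, assuming the usual jobbole list-page selectors and a detail callback named parse_detail (names not confirmed by the original notes):

import scrapy
from urllib import parse
from scrapy.http import Request

class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    start_urls = ["http://blog.jobbole.com/all-posts/"]

    def parse(self, response):
        # List page: extract each post's thumbnail and pass it to the
        # detail request through meta
        for post_node in response.css("#archive .floated-thumb .post-thumb a"):
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

    def parse_detail(self, response):
        # get() returns the default ("") instead of raising KeyError
        front_image_url = response.meta.get("front_image_url", "")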
3. Downloading images
If a ValueError appears here, it is because the images pipeline iterates over the URL field as a list of URLs; change the field from a single string into a list.
Override the ImagesPipeline so the local file path of the downloaded image can be written back onto the item, as sketched below.
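A hedged sketch of such an override plus the related settings (the class name ArticleImagePipeline, package name ArticleSpider, and field names front_image_url / front_image_path are this tutorial's conventions, assumed here):

from scrapy.pipelines.images import ImagesPipeline

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples; keep the local path
        if "front_image_url" in item:
            for ok, value in results:
                if ok:
                    item["front_image_path"] = value["path"]
        return item

And the matching settings.py entries:

import os

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}
IMAGES_URLS_FIELD = "front_image_url"  # must hold a list, per the ValueError note above
IMAGES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'images')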
4. MD5 hashing of the URL
In the spider, the helper is imported from a shared utils module (Axxx stands for the project package):

from Axxx.utils.common import get_md5

And in utils/common.py, with the missing hashlib import added:

import hashlib

def get_md5(url):
    # md5 operates on bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()
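The hash is then used to fill a fixed-length key field on the item; a one-line usage sketch (the field name url_object_id is an assumption, matching the primary key suggested in section 5):

# In parse_detail(), assuming the item has a url_object_id field:
article_item["url_object_id"] = get_md5(response.url)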
5. Adding fields via Navicat
Be sure to set a primary key; the fixed-length MD5 of the URL from section 4 is a natural choice.
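The table itself is built by hand in Navicat, so no code is required; purely for reference, a hedged sketch of equivalent DDL issued through MySQLdb (column names follow the fields used in section 6, while the column types and the url_object_id key are assumptions):

import MySQLdb

conn = MySQLdb.connect('127.0.0.1', 'root', 'password', 'article_spider',
                       charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole_article (
        url_object_id VARCHAR(50) NOT NULL PRIMARY KEY,  -- md5 of the url (section 4)
        title VARCHAR(200) NOT NULL,
        url VARCHAR(300) NOT NULL,
        create_date DATE,
        fav_nums INT NOT NULL DEFAULT 0
    )
""")
conn.close()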
6. Saving the data to MySQL
(1) Synchronous writes
import MySQLdb

class MysqlPipeline(object):
    # Write to MySQL synchronously: every insert blocks until commit finishes
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'password',
                                    'article_spider', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"],
                                         item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item  # pass the item on to any later pipelines
Then, in settings.py, register this pipeline in ITEM_PIPELINES.
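For example (the project package name ArticleSpider is an assumption):

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlPipeline': 2,  # lower number = runs earlier
}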
(2) Asynchronous writes
Configure settings.py:
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
Then write the pipeline:
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

# adbapi.ConnectionPool signature: def __init__(self, dbapiName, *connargs, **connkw)
class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # **dbparms expands to ConnectionPool("MySQLdb", host=settings['MYSQL_HOST'], ...)
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to turn the blocking MySQL insert into an async call
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert; each item type builds its own SQL
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
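Register MysqlTwistedPipline in ITEM_PIPELINES in place of the synchronous version. do_insert() expects every item class to supply its own SQL through get_insert_sql(); a minimal sketch, assuming the same jobbole_article columns as in (1):

import scrapy

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    create_date = scrapy.Field()
    fav_nums = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        params = (self["title"], self["url"],
                  self["create_date"], self["fav_nums"])
        return insert_sql, params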
7. Writing directly into Django
https://github.com/scrapy-plugins/scrapy-djangoitem
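The plugin exposes Django models as Scrapy items; a minimal sketch following its README (the app path myapp and the model Article are hypothetical):

from scrapy_djangoitem import DjangoItem
from myapp.models import Article  # hypothetical Django model

class ArticleItem(DjangoItem):
    django_model = Article

# In a pipeline, item.save() persists the row through the Django ORM.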