Crawlers: pyspider


一、Python

二、Charles

A packet-capture (HTTP proxy) tool.

Usage

Connect the phone and the computer to the same Wi-Fi. In Charles, go to Help → Local IP Address to get the computer's local IP.

On the phone, open the Wi-Fi network's settings (Modify network → Advanced options), set Proxy to Manual, and fill in that IP plus Charles's port (8888 by default).

Capturing HTTPS additionally requires installing the Charles SSL certificate.
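To confirm traffic is actually flowing through Charles, a quick check from Python; the LAN IP below is a placeholder for whatever Help → Local IP Address reports, and 8888 is the default port:

import requests

# Placeholder address: substitute the IP shown by Help -> Local IP Address.
proxies = {
    "http": "http://192.168.1.100:8888",
    "https": "http://192.168.1.100:8888",
}
# verify=False because Charles re-signs HTTPS traffic with its own certificate.
resp = requests.get("https://www.bilibili.com", proxies=proxies, verify=False)
print(resp.status_code)  # the request should now show up in Charles's session list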

Reference: Charles 从入门到精通

三、pyspider

3.1 Windows installation

3.1.1 Install Python

Download link

3.1.2 Install the pycurl build matching your Python version

Download link
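A quick sanity check that pycurl installed correctly (the version string will vary with the build you downloaded):

import pycurl

# Should print something like 'PycURL/7.43.0 libcurl/7.x.x ...'
print(pycurl.version)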

3.1.3 Write the config file

{
  "taskdb": "mysql+taskdb://root:0120@127.0.0.1:3306/taskdb",
  "projectdb": "mysql+projectdb://root:0120@127.0.0.1:3306/projectdb",
  "resultdb": "mysql+resultdb://root:0120@127.0.0.1:3306/resultdb",
  "queue-maxsize": 2000,
  "webui": {
    "port": 5000,
    "process-time-limit": 300
  },
  "scheduler": {
    "fail-pause-num": 0,
    "active-tasks": 2000,
    "loop-limit":2000,
    "threads":8
  },
  "processor": {
    "process-time-limit": 300
  },
  "fetcher": {
    "poolsize":2000
  },
  "all": {
    "fetcher-num": 20,
    "processor-num": 20
  }
}


# Save this JSON (e.g. as pyspider-config.json) in pyspider's installation directory:
C:\Users\Administrator\AppData\Local\Programs\Python\Python35\Lib\site-packages\pyspider
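pyspider creates its own tables, but the taskdb/projectdb/resultdb databases named in the URIs (mysql+taskdb://user:password@host:port/database) generally have to exist first. A one-off setup sketch, with credentials mirroring the config above:

import mysql.connector

# Credentials mirror the connection strings in the config above.
cnx = mysql.connector.connect(host='127.0.0.1', port=3306, user='root', password='0120')
cursor = cnx.cursor()
for db in ('taskdb', 'projectdb', 'resultdb'):
    cursor.execute("CREATE DATABASE IF NOT EXISTS {} DEFAULT CHARACTER SET utf8".format(db))
cnx.close()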

3.1.4 Start pyspider with the config file

# run from pyspider's installation directory
pyspider -c pyspider-config.json

3.1.5 Open localhost:5000

3.2 Mac

Activate the pyspider virtualenv:

source venv/bin/activate

Start pyspider with your own config:

pyspider -c pyspider-config.json

Open http://localhost:5000/ to reach the pyspider dashboard.

Click Create, then copy the sample code shown on the right side of the page and adapt it into your own crawler.
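The generated sample is roughly the skeleton below; the seed URL and the age/every values are template defaults to replace with your own:

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        # Seed request; replace with the site you want to crawl.
        self.crawl('http://example.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every absolute link on the page.
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }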

Reference: pyspider official site

Reference: pyspider documentation

pyspider中内容选择器常用方法汇总

Python中PyQuery库的使用总结

在Mac上安装phantomjs及运行
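Two points the references above cover, condensed: response.doc is a PyQuery object, and JavaScript-rendered pages can be fetched through PhantomJS by passing fetch_type='js' (PhantomJS must be on the PATH when pyspider starts):

from pyspider.libs.base_handler import *


class JsHandler(BaseHandler):
    def on_start(self):
        # fetch_type='js' routes the request through the phantomjs fetcher,
        # so JavaScript-generated content is present in response.doc.
        self.crawl('http://example.com/js-page', fetch_type='js',
                   callback=self.detail_page)

    def detail_page(self, response):
        # response.doc is a PyQuery object: CSS selectors work directly.
        return {
            'title': response.doc('title').text(),
            'first_link': response.doc('a').eq(0).attr.href,
        }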

MySQL

参考:https://segmentfault.com/q/1010000006507996

pip search mysql-connector | grep --color mysql-connector-python
# then pick the version to install
pip install mysql-connector-python-rf==2.1.3
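A quick check that the driver imports and can reach the server (credentials are placeholders):

import mysql.connector

cnx = mysql.connector.connect(host='localhost', port=3306,
                              user='root', password='******', database='test')
print(cnx.is_connected())  # True if the connection works
cnx.close()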

四、Crawler code

Bilibili trending ranking

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-02-03 16:36:48
# Project: bilibili_ranking

from pyspider.libs.base_handler import *
import time
import mysql.connector
import json


class Handler(BaseHandler):
    crawl_config = {
    }

    mysql_config = {
        'host': 'localhost',
        'user': '***',
        'password': '******',
        'port': 3306,
        'database': '*****',
        'charset': 'utf8'
    }

    cnx = mysql.connector.connect(**mysql_config)
    cursor = cnx.cursor()

    table_name = 'bilibili_ranking'

    @every(minutes=1 * 60)
    def on_start(self):
        self.crawl('https://api.bilibili.com/x/web-interface/ranking?rid=0&day=3&jsonp=jsonp',
                   callback=self.index_page)

    @config(age=0)
    def index_page(self, response):
        data = json.loads(response.text)['data']
        result_list = []
        for article in data['list']:
            # Store every field as str to match the varchar columns;
            # under Python 3 no .encode() is needed before the MySQL insert.
            result = {}
            result['aid'] = str(article['aid'])
            result['author'] = article['author']
            result['coins'] = str(article['coins'])
            result['duration'] = str(article['duration'])
            result['mid'] = str(article['mid'])
            result['pic'] = article['pic']
            result['play'] = str(article['play'])
            result['pts'] = str(article['pts'])
            result['title'] = article['title']
            result['trend'] = str(article['trend'])
            result['video_review'] = str(article['video_review'])
            result['update_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
            result['table_name'] = self.table_name
            result_list.append(result)
        return result_list

    def on_result(self, result):
        # index_page returns a list of rows; write each row to MySQL.
        if not result:
            return
        print(result)
        for item in result:
            self.process_result(item)

    def process_result(self, result):
        # Upsert one row: insert it, or refresh the existing row when the
        # primary key (aid) is already present.
        table_name = result.pop('table_name')

        def esc(value):
            return str(value).replace("'", "\\'")

        col_str = ', '.join(result.keys())
        row_str = ', '.join("'{}'".format(esc(v)) for v in result.values())
        update_str = ', '.join("{} = '{}'".format(k, esc(v)) for k, v in result.items())
        sql = "INSERT INTO {} ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}".format(
            table_name, col_str, row_str, update_str)
        self.cursor.execute(sql)
        self.cnx.commit()
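Hand-rolled escaping like the above still breaks on backslashes and other edge cases. A parameterized variant of process_result (a sketch with the same intended behavior) lets the driver do the quoting:

# Drop-in replacement for Handler.process_result.
def process_result(self, result):
    table_name = result.pop('table_name')
    cols = ', '.join(result.keys())
    placeholders = ', '.join(['%s'] * len(result))
    updates = ', '.join('{} = VALUES({})'.format(k, k) for k in result)
    sql = 'INSERT INTO {} ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}'.format(
        table_name, cols, placeholders, updates)
    self.cursor.execute(sql, list(result.values()))
    self.cnx.commit()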

Database (MySQL)

CREATE TABLE `bilibili_ranking` (
  `aid` varchar(32) NOT NULL DEFAULT '',
  `author` varchar(32) DEFAULT NULL,
  `coins` varchar(32) DEFAULT NULL,
  `duration` varchar(32) DEFAULT NULL,
  `mid` varchar(32) DEFAULT NULL,
  `pic` varchar(128) DEFAULT NULL,
  `play` varchar(32) DEFAULT NULL,
  `pts` varchar(128) DEFAULT NULL,
  `update_time` datetime DEFAULT NULL,
  `title` varchar(32) DEFAULT NULL,
  `trend` varchar(32) DEFAULT NULL,
  `video_review` varchar(32) DEFAULT NULL,
  PRIMARY KEY (`aid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
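Because the numeric fields are stored as varchar, cast before sorting. A quick look at the current top 10 (credentials are placeholders):

import mysql.connector

cnx = mysql.connector.connect(host='localhost', user='root',
                              password='******', database='******')
cursor = cnx.cursor()
cursor.execute("SELECT title, play, pts FROM bilibili_ranking "
               "ORDER BY CAST(pts AS UNSIGNED) DESC LIMIT 10")
for title, play, pts in cursor:
    print(title, play, pts)
cnx.close()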

Lianjia second-hand homes

Using Hangzhou as an example.

from pyspider.libs.base_handler import *

start_page = 1
stop_page = 100


class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        self.url = "https://m.lianjia.com/hz/ershoufang/index/pg"
        self.start_page = start_page
        self.stop_page = stop_page

    @every(minutes=24 * 60)
    def on_start(self):
        # Queue listing pages pg1 .. pg99; pyspider dedupes repeated URLs.
        while self.start_page < self.stop_page:
            url = self.url + str(self.start_page)
            self.crawl(url, callback=self.index_page)
            self.start_page += 1

    @config(age=0)
    def index_page(self, response):
        for each in response.doc('li[class="pictext"]').items():
            detail_url = each('a').attr.href
            print(detail_url)
            self.crawl(detail_url, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # Under Python 3, .text() already returns str, so the Python-2-era
        # .encode('utf8', 'ignore') calls have been dropped.
        return {
            "url": response.url,
            "标题": response.doc('title').text(),
            "售价": response.doc('.similar_data > .similar_data_detail > p > span').text(),
            "单价": response.doc('ul > .short').eq(0).text().split(":")[1],
            "挂牌": response.doc('ul > .short').eq(1).text().split(":")[1],
            "朝向": response.doc('ul > .short').eq(2).text().split(":")[1],
            "楼层": response.doc('ul > .short').eq(3).text().split(":")[1],
            "楼型": response.doc('ul > .short').eq(4).text().split(":")[1],
            "电梯": response.doc('ul > .short').eq(5).text().split(":")[1],
            "装修": response.doc('ul > .short').eq(6).text().split(":")[1],
            "年代": response.doc('ul > .short').eq(7).text().split(":")[1],
            "用途": response.doc('ul > .short').eq(8).text().split(":")[1],
            "权属": response.doc('ul > .short').eq(9).text().split(":")[1],
            "房源编码": response.doc('ul > .long').eq(0).text().split(":")[1],
            "首付预算": response.doc('ul > .long').eq(1).text().split(":")[1],
            "标签": response.doc('.tag_group').text(),
            "小区": response.doc('ul > .long').eq(2).text().split(":")[1],
            "房源介绍": response.doc('.sub_mod_box > .mod_cont').eq(2).text(),
            "经纪人带看反馈": response.doc('.sub_mod_box > .mod_cont').eq(3).text(),
            "房源户型": response.doc('.info_li > .info_content').eq(0).text(),
            "建筑面积": response.doc('.info_li > .info_content').eq(1).text(),
            "套内面积": response.doc('.info_li > .info_content').eq(2).text(),
            "户型结构": response.doc('.info_li > .info_content').eq(3).text(),
            "梯户比例": response.doc('.info_li > .info_content').eq(4).text(),
            "供暖方式": response.doc('.info_li > .info_content').eq(5).text(),
            "上次交易": response.doc('.info_li > .info_content').eq(6).text(),
            "购房年限": response.doc('.info_li > .info_content').eq(7).text(),
            "房屋用途": response.doc('.info_li > .info_content').eq(8).text(),
            "交易权属": response.doc('.info_li > .info_content').eq(9).text(),
            "产权所属": response.doc('.info_li > .info_content').eq(10).text(),
            "抵押信息": response.doc('.info_li > .info_content').eq(11).text(),
            "房本备件": response.doc('.info_li > .info_content').eq(12).text(),
            "看房时间": response.doc('.info_li > .info_content').eq(13).text(),
            "链家编号": response.doc('.info_li > .info_content').eq(14).text(),
            "土地年限": response.doc('.info_li > .info_content').eq(15).text(),
            "近7日带看(次)": response.doc('.mod_box > .mod_cont > .data > .box_col').eq(0).text().split(' ')[1],
            "30日带看(次)": response.doc('.mod_box > .mod_cont > .data > .box_col').eq(1).text().split(' ')[1],
            "关注(人)": response.doc('.mod_box > .mod_cont > .data > .box_col').eq(2).text().split(' ')[1],
            "户型分间": response.doc('.sub_mod_box').eq(0).text(),
            "户型分间img": response.doc('.sub_mod_box > .mod_cont > .pictext > .mod_media > .media_main > .lazyload').attr('origin-src')
        }
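Every .split(":")[1] above raises IndexError as soon as one label is missing from the page. A small helper (hypothetical, not part of the original handler) makes the extraction tolerant:

def field(response, selector, idx):
    """Text after the first colon of the idx-th match, or '' when absent."""
    txt = response.doc(selector).eq(idx).text()
    return txt.split(':', 1)[1].strip() if ':' in txt else ''

# Usage inside detail_page, e.g.:
#     "单价": field(response, 'ul > .short', 0),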


Lianjia communities (xiaoqu)

Using Hangzhou as an example.

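A minimal sketch of a community crawler feeding the lianjia_xiaoqu table below. The listing URL pattern, the CSS selectors, and the data-housecode attribute are assumptions about the page structure, to be verified against the live site:

from pyspider.libs.base_handler import *
import mysql.connector


class Handler(BaseHandler):
    crawl_config = {}

    mysql_config = {
        'host': 'localhost',
        'user': 'root',
        'password': '******',  # placeholder
        'port': 3306,
        'database': '******',  # placeholder
        'charset': 'utf8'
    }

    cnx = mysql.connector.connect(**mysql_config)
    cursor = cnx.cursor()

    @every(minutes=24 * 60)
    def on_start(self):
        # Hypothetical paginated listing of Hangzhou communities.
        for page in range(1, 101):
            self.crawl('https://hz.lianjia.com/xiaoqu/pg{}/'.format(page),
                       callback=self.index_page)

    @config(age=0)
    def index_page(self, response):
        # '.listContent li' and the selectors below are guesses at the markup.
        for each in response.doc('.listContent li').items():
            row = {
                'housecode': each.attr('data-housecode'),
                'title': each('.title a').text(),
                'district': each('.district').text(),
                'bizcircle': each('.bizcircle').text(),
                'url': each('.title a').attr.href,
            }
            self.save(row)

    def save(self, row):
        # Parameterized upsert keyed on housecode (the table's primary key).
        cols = ', '.join(row.keys())
        placeholders = ', '.join(['%s'] * len(row))
        updates = ', '.join('{} = VALUES({})'.format(k, k) for k in row)
        sql = 'INSERT INTO lianjia_xiaoqu ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}'.format(
            cols, placeholders, updates)
        self.cursor.execute(sql, list(row.values()))
        self.cnx.commit()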

Database (MySQL)

CREATE TABLE `lianjia_xiaoqu` (
  `housecode` varchar(32) NOT NULL DEFAULT '',
  `title` varchar(32) DEFAULT NULL,
  `totalPrice` int(11) DEFAULT NULL,
  `build_time` int(11) DEFAULT NULL,
  `district` varchar(11) DEFAULT NULL,
  `bizcircle` varchar(11) DEFAULT NULL,
  `totalSellCount` int(11) DEFAULT NULL,
  `url` varchar(128) DEFAULT NULL,
  `subwary` varchar(32) DEFAULT NULL COMMENT 'existing subway lines',
  `resblockPosition` varchar(32) DEFAULT NULL,
  PRIMARY KEY (`housecode`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

五、Reference links

Pyspider windows下的安装
