Scrapy: download images to a specified directory, create thumbnails, and store records in a database


Environment and tools: Python 2.7, Scrapy

Test site: http://www.27270.com/tag/333.html. The goal is to crawl all of the bunny-girl images; the recommendation links at the bottom of each page need to be filtered out.

Logic: analyze the site structure. Downloading images and writing them to the database requires enabling ITEM_PIPELINES; the thumbnail configuration also needs to be enabled, and the downloaded images are moved to their final per-tag folders afterward.
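For context, the code below assumes a standard Scrapy project layout; the project name MyPicSpider is taken from the import paths used in the pipelines and spider:

MyPicSpider/
    scrapy.cfg
    MyPicSpider/
        settings.py
        items.py
        pipelines.py
        spiders/
            spider.py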

 -----settings.py

## Do not obey robots.txt
ROBOTSTXT_OBEY = False
## Delay between requests (seconds)
DOWNLOAD_DELAY = 3
## Disable cookies
COOKIES_ENABLED = False
## Enable the item pipelines
ITEM_PIPELINES = {
    "MyPicSpider.pipelines.MyImagesPipeline": 300,
    "MyPicSpider.pipelines.MysqlPipeline": 400,
}
## Download directory (backslashes must be escaped on Windows)
IMAGES_STORE = "G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\"
## Filter out images smaller than 110x110
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
## Thumbnail sizes
IMAGES_THUMBS = {
    "big": (270, 270),
}
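For reference, with these settings ImagesPipeline first downloads originals into full/ and thumbnails into thumbs/<size name>/ under IMAGES_STORE, named by the SHA1 hash of the image URL. So before the move step in the pipeline below, the files look roughly like:

G:\www\scrapy_rpo\pic\meinv\rabbit\
    full\80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg
    thumbs\big\80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg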

------items.py

import scrapy


class PicspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    tag = scrapy.Field()           # gallery tag / category name
    image_urls = scrapy.Field()    # list of image URLs consumed by ImagesPipeline
    images_data = scrapy.Field()   # reserved, not used below
    img_path = scrapy.Field()      # list of (image path, thumbnail path) tuples
    img_big_path = scrapy.Field()  # reserved, not used below
    file_path = scrapy.Field()     # destination directory for this tag
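For illustration, a fully processed item might end up looking like this (illustrative values, not real crawl output):

{"tag": u"...",
 "image_urls": ["http://www.27270.com/.../1.jpg"],
 "file_path": "G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\...",
 "img_path": [("picture/meinv/rabbit/.../<sha1>.jpg",
               "picture/meinv/rabbit/.../<sha1>_b.jpg")]}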

----pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


import os
import shutil
import datetime
import scrapy
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
# Import the project settings
from scrapy.utils.project import get_project_settings
class MyImagesPipeline(ImagesPipeline):
    # Read the image download root from the project settings
    img_store = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Schedule one download request per image URL."""
        for image_url in item["image_urls"]:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        file_path = item["file_path"]
        # Create the per-tag destination directory if it does not exist yet
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        print image_paths
        ## e.g. image_paths == ["full/80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg"]
        pic_list = []
        for v in image_paths:
            pic_name = v.replace("full/", "")
            pic_big_name = pic_name.replace(".jpg", "") + "_b.jpg"
            # Move the full-size image from the default download path
            # into the per-tag directory
            shutil.move(os.path.join(self.img_store, "full", pic_name),
                        os.path.join(file_path, pic_name))
            # Move the thumbnail as well, renaming it with a "_b" suffix
            shutil.move(os.path.join(self.img_store, "thumbs", "big", pic_name),
                        os.path.join(file_path, pic_big_name))
            # Record the web-facing relative paths for the database pipeline
            img_path_dict = ("picture/meinv/rabbit/" + item["tag"] + "/" + pic_name,
                             "picture/meinv/rabbit/" + item["tag"] + "/" + pic_big_name)
            pic_list.append(img_path_dict)
        item["img_path"] = pic_list
        return item
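# For reference, the "results" argument passed to item_completed is a list of
# (success, info) tuples produced by ImagesPipeline, roughly:
#   [(True, {"url": "http://...0.jpg",
#            "path": "full/80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg",
#            "checksum": "..."})]
# which is why image_paths collects the relative "path" of each successful download.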

## Database pipeline
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.Connection(host="localhost", user="root", passwd="root",
                                       db="test1", charset="utf8")
        # Create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        ### Assemble the rows to insert
        rows = []
        datetime_now = datetime.datetime.now()
        datetime_str = datetime_now.strftime("%Y-%m-%d %H:%M:%S")
        ## Insert the tag as a new type if it does not exist yet
        result = self.cursor.execute(
            "select id from network_type where RESOURCETYPE='p' and TYPENAME=%s",
            (item["tag"],))
        if result == 0:
            self.cursor.execute(
                "insert into network_type(PID,RESOURCETYPE,TYPENAME) values(%s,%s,%s)",
                (2415, "p", item["tag"]))
            typeid = self.cursor.lastrowid
            self.conn.commit()
        else:
            # The tag already exists, meaning this gallery was crawled before; skip it
            return False

        types = "," + str(typeid) + ","
        # Continue numbering from the current maximum id in network_picture
        self.cursor.execute("select id from network_picture order by cast(id as SIGNED INTEGER) desc limit 0,1")
        old_id = self.cursor.fetchone()
        if old_id:
            id_n = int(old_id[0]) + 1
        else:
            id_n = 1
        for path1, path2 in item["img_path"]:
            self.cursor.execute(
                "select id from network_picture where FILEPATH=%s and fileScalPath=%s",
                (path1, path2))
            if self.cursor.fetchone():
                print u"This record already exists"
            else:
                # Only append a row and advance the id when the record is new
                rows.append((str(id_n), "", path1, "", types, 0, datetime_str, path2))
                id_n += 1
        print rows
        self.cursor.executemany(
            "insert into network_picture(ID,NAME,FILEPATH,FILESIZE,TYPES,STATUS,DATETIME,fileScalPath) values(%s,%s,%s,%s,%s,%s,%s,%s)",
            rows)
        self.conn.commit()
        return item
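The database pipeline assumes two pre-existing MySQL tables. The original post does not show their schema; a minimal sketch consistent with the columns used above could look like this (all column types here are assumptions):

CREATE TABLE network_type (
    ID int NOT NULL AUTO_INCREMENT PRIMARY KEY,  -- assumed auto-increment, read via lastrowid
    PID int,                                     -- parent type id (2415 above)
    RESOURCETYPE varchar(8),                     -- 'p' for pictures
    TYPENAME varchar(128)                        -- the tag name
);

CREATE TABLE network_picture (
    ID varchar(32),            -- stored as a string but ordered numerically above
    NAME varchar(128),
    FILEPATH varchar(256),
    FILESIZE varchar(32),
    TYPES varchar(64),         -- comma-wrapped type id, e.g. ',12,'
    STATUS int,
    DATETIME datetime,
    fileScalPath varchar(256)  -- thumbnail path
);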

----spider.py

# -*- coding: utf-8 -*-
import scrapy, os, urllib2
from scrapy.linkextractors import LinkExtractor  ## LinkExtractor selects and follows links
from scrapy.spiders import CrawlSpider, Rule     ## CrawlSpider template and Rule definitions
from MyPicSpider.items import PicspiderItem      ## the item defined in items.py
# Import the project settings
from scrapy.utils.project import get_project_settings
from bs4 import BeautifulSoup
import time, pymysql

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
conn = pymysql.Connection(host="localhost", user="root", passwd="root", db="test1", charset="utf8")
# Create a cursor
cursor = conn.cursor()

class PicSpider(CrawlSpider):    ## inherits the CrawlSpider template; a plain spider would inherit from Spider
    name = "pic"                 ### spider name; run with:  scrapy crawl pic
    allowed_domains = ["www.27270.com"]                  ## restrict the crawl to this domain
    start_urls = ["http://www.27270.com/tag/333.html"]   ### initial URL
    #### With follow=True the spider keeps following matching pages:
    #### it starts from the index page, extracts detail-page URLs plus the next
    #### pagination link, follows each pagination page in turn, and hands every
    #### detail page to the callback for scraping.
    rules = (
        ### Follow all pagination pages reachable from start_urls
        Rule(LinkExtractor(allow=r"/tag/[0-9]*_[0-9]*.html"), follow=True),
        ### Crawl the detail pages found on each pagination page and pass the
        ### response to the parse_item callback
        Rule(LinkExtractor(allow=r"http://www.27270.com/ent/[a-z]*/[0-9]*/[0-9]*.html"), callback="parse_item", follow=False),
    )
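    # Illustrative examples of URLs each rule is meant to match (made-up paths,
    # not taken from a real crawl):
    #   pagination, first rule:   http://www.27270.com/tag/333_2.html
    #   detail page, second rule: http://www.27270.com/ent/xxx/2018/123456.html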
    #### Detail-page callback
    def parse_item(self, response):
        start_url = response.url
        item = PicspiderItem()
        tag_name = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()[0]
        name = u"人体"
        if name not in tag_name:
            print u"---- this page belongs to another category ----"
            return False
        li_list = response.xpath('//ul[@class="articleV4Page l"]/li').extract()
        srcs = []
        # Skip the last three <li> elements, which are navigation links
        # rather than picture pages
        for v in range(1, (len(li_list) - 3)):
            if v == 1:
                url_s = start_url
            else:
                url_s = start_url.replace(".html", "") + "_" + str(v) + ".html"
            try:
                request = urllib2.Request(url_s, headers=headers)
                page = urllib2.urlopen(request, timeout=200).read()
            except urllib2.URLError, err:
                print err, "bad url: " + url_s
                continue
            obj = BeautifulSoup(page, "html.parser")
            pic_url = None
            try:
                pic_url = obj.find("center").find("img")["src"]
            except:
                print u"---- first extraction method failed ----"
                try:
                    pic_url = obj.find("div", {"id": "picBody"}).find("img")["src"]
                except:
                    print u"---- second extraction method failed ----"
                    try:
                        pic_url = obj.find("p", attrs={"style": "text-align: center"}).find("img")["src"]
                    except:
                        print u"---- third extraction method failed ----"
            if pic_url:
                srcs.append(pic_url)
        item["tag"] = tag_name
        item["file_path"] = "%s%s" % (get_project_settings().get("IMAGES_STORE"), tag_name)
        item["image_urls"] = srcs
        return item
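With everything in place, the spider is started from the project root:

$ scrapy crawl pic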

------A final note: I'm still not very familiar with how Scrapy handles deduplication; if anyone knows more about it, please let me know. Thanks.
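One relevant piece: within a single run, Scrapy already deduplicates requests through its default dupefilter (scrapy.dupefilters.RFPDupeFilter), which fingerprints each request. To persist the seen-request fingerprints across runs, a job directory can be supplied, for example (crawls/pic-run1 is just an example path):

$ scrapy crawl pic -s JOBDIR=crawls/pic-run1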
