[Original] Crawling Beauty Photos with Python Scrapy

Original article. Please credit the source when reposting: reposted from 勤奋的小青蛙.
Permalink: [Original] Crawling Beauty Photos with Python Scrapy

This is a crawler I wrote a long time ago. While tidying up my computer today I felt the code still has some reference value, so I'm posting it here, for reference only.

First, the item definition code:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PiaoliangmmItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    filename = scrapy.Field()
    description = scrapy.Field()
    category = scrapy.Field()

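The item is just a container for the five fields the spider fills in for each picture. A Scrapy item behaves like a dict, so populating and reading it looks like this (the values below are made-up placeholders, only for illustration):

from piaoliangmm.items import PiaoliangmmItem

item = PiaoliangmmItem()
item['title'] = u'placeholder title'               # filled in by parse_image_page in the spider below
item['link'] = 'http://www.example.com/1.jpg'      # placeholder URL, not a real image address
item['filename'] = '1.jpg'
data = dict(item)                                  # an Item converts to a plain dict for inspection
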
Then the spider code:

# -*- coding: utf-8 -*-
import scrapy
import re
import string
from scrapy.selector import Selector

from piaoliangmm.items import PiaoliangmmItem

class PiaoLiangmmSpider(scrapy.Spider):
	name = "piaoliangmm"
	allowed_domains = ["mmonly.cc"]
	start_urls = [
		"http://www.mmonly.cc/glamour/",
		"http://www.mmonly.cc/beauty/",
		"http://www.mmonly.cc/photo/",
		"http://www.mmonly.cc/korea/",
		"http://www.mmonly.cc/beautyleg/",
		"http://www.mmonly.cc/cosplay/",
		"http://www.mmonly.cc/jiepaimeinv/"
	]
	
	def parse(self, response):
		for link_sel in response.xpath("//div[@class='c s_li zxgx_list l']/ul/li/a"):
			url = link_sel.xpath("@href").extract()[0]
			#print the url in the first page
			#print url
			#call the url
			yield scrapy.Request(url, callback = self.parse_item_contents, dont_filter = True)
		
		#handle the next page
		nextpage = response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a/text()").extract()[0]
		if (nextpage == unicode('下一页','utf-8')):
			#print response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a").xpath("@href").extract()[0]
			#print response.url.split('/')[-3]
			#print response.url.split('/')[-2]
			inner_url = "http://" + response.url.split('/')[-3] + "/" + response.url.split('/')[-2] + "/" + response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a").xpath("@href").extract()[0]
			#print inner_url
			yield scrapy.Request(inner_url, callback = self.parse_next_page)
			
	
	def parse_item_contents(self, response):
		#print "parse_item_contents" + response.url
		page_text = response.xpath("//div[@class='pages c mt5']/ul/li[1]/a/text()").extract()[0]
		page_temp = re.findall(r'(\w*[0-9]+)\w*', page_text)[0]
		page_num = string.atoi(page_temp)
		
		#for test
		#page_num = 3
		
		
		for i in range(1, page_num + 1):
			#print "i = " + str(i)
			if (i == 1):
				next_url = response.url
			else:
				next_url = "http://" + response.url.split('/')[2] + "/" + response.url.split('/')[3] + "/" +response.url.split('/')[-1].split('.')[0] + "_" + str(i) + ".html"
			#print "next_url = " + next_url
			yield scrapy.Request(next_url, callback = self.parse_image_page, dont_filter = True)
		
	
	def parse_image_page(self, response):
		#print "test = " + response.url
		mm = PiaoliangmmItem()
		title = response.xpath("//div[@class='atc_tit mt10']/h1/text()").extract()[0]
		if title:
			p = re.compile(r'\(.*?\)')
			mm['title'] = p.sub("", title)
			#print mm['title']
		
		link = response.xpath("//div[@id='picView']/p[@align='center']/a/img").xpath("@src").extract()[0]
		if link:
			mm['link'] = link
			#print mm['link']
		
		filename = response.url.split('/')[-1].split('.')[0] + '.jpg'
		if filename:
			mm['filename'] = filename
		
		description = response.xpath("//div[@class='mt5 art_txt']/h2/text()").extract()[0]
		if description:
			mm['description'] = description
			#print mm['description']

		category = response.xpath("//div[@class='position mt10']/a[2]/text()").extract()[0]
		if category:
			mm['category'] = category
		
		return mm
	
	def parse_next_page(self, response):
		#print "come parse_next_page"
		for link_sel in response.xpath("//div[@class='c s_li zxgx_list l']/ul/li/a"):
			url = link_sel.xpath("@href").extract()[0]
			#print the url in the first page
			#print url
			#call the url
			yield scrapy.Request(url, callback = self.parse_item_contents, dont_filter = True)
		
		#handle the next page
		nextpage = response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a/text()").extract()[0]
		if (nextpage == unicode('下一页','utf-8')):
			#print response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a").xpath("@href").extract()[0]
			#print response.url.split('/')[-3]
			#print response.url.split('/')[-2]
			inner_url = "http://" + response.url.split('/')[-3] + "/" + response.url.split('/')[-2] + "/" + response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a").xpath("@href").extract()[0]
			#print inner_url
			yield scrapy.Request(inner_url, callback = self.parse_next_page, dont_filter = True)

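One thing to note: the spider above is Python 2 code (string.atoi and the unicode built-in no longer exist in Python 3). If you want to try it on Python 3, at the very least those constructs need replacing; a minimal sketch of the equivalents (not a drop-in patch):

# -*- coding: utf-8 -*-
page_temp = '24'
page_num = int(page_temp)      # replaces string.atoi(page_temp); string.atoi was removed in Python 3

nextpage = '下一页'
if nextpage == '下一页':        # replaces unicode('下一页', 'utf-8'); str is already Unicode in Python 3
    print(page_num)            # print is a function in Python 3
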
Next, the pipeline code:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
import urllib
import sys
import os
from piaoliangmm.items import PiaoliangmmItem

class PiaoliangmmPipeline(object):
	def __init__(self):
		self.dbpool = adbapi.ConnectionPool(
			'MySQLdb',
			host='127.0.0.1',
			db='mmpic',
			user='root',
			passwd='root',
			cursorclass=MySQLdb.cursors.DictCursor,
			charset='utf8',
			use_unicode=True
		)

	def process_item(self, item, spider):
		#if (os.path.exists(r'F:/mmonlypic/' + item['title'])==False):
			#os.makedirs(r'F:/mmonlypic/' + item['title'])
		#if (item['filename']):
			#local = "F:/mmonlypic/" + item['title'] + "/" + item['filename']
			#print "file = " + local
			#urllib.urlretrieve(item['link'], local)
			#query = self.dbpool.runInteraction(self._conditional_insert, item)
		query = self.dbpool.runInteraction(self._conditional_insert, item)
		return item
		
	def _conditional_insert(self, tx, item):
		#sql = "insert into mmonly (title, link, filename, description) values (%s, %s, %s, %s)"
		sql = "update mmonly set category = %s where link = %s"
		#tx.execute(sql, (item['title'], item['link'], item['filename'], item['description']))
		tx.execute(sql, (item['category'], item['link']))

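Two assumptions are baked into this pipeline: a MySQL database mmpic with a table mmonly must already exist, and since the active SQL is an UPDATE keyed on link, the rows must have been inserted earlier (for example by a previous run using the commented-out INSERT statement). The post never shows the schema, so here is only a minimal sketch of a table that matches the columns used above (the column types and lengths are my guesses):

# -*- coding: utf-8 -*-
# One-off helper to create the table the pipeline writes to (schema is an assumption).
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='root', db='mmpic', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS mmonly (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        link VARCHAR(512),
        filename VARCHAR(255),
        description TEXT,
        category VARCHAR(64)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()
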
Finally, the settings file:

# -*- coding: utf-8 -*-

# Scrapy settings for piaoliangmm project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'piaoliangmm'

SPIDER_MODULES = ['piaoliangmm.spiders']
NEWSPIDER_MODULE = 'piaoliangmm.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'piaoliangmm (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'piaoliangmm.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'piaoliangmm.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'piaoliangmm.pipelines.PiaoliangmmPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
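
With the four files above laid out as a standard Scrapy project, the crawl can be started from the project root with "scrapy crawl piaoliangmm". It can also be driven from a small script; a minimal sketch, assuming the script sits next to scrapy.cfg so that get_project_settings() can find the project settings:

# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('piaoliangmm')   # the spider name defined in PiaoLiangmmSpider.name
process.start()                # blocks until the crawl finishes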