
Fetching the WeChat avatars from "我要个性网"

Update: used a Java thread pool to speed up the crawling; see the code link. Since crawling one page in this example does not affect any of the others, the program could be made parallel; a Python sketch of the same idea follows below.

--2019.12.10
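The linked code is Java, but the same pattern is easy to express in Python. Here is a minimal sketch of the idea using Python 3's concurrent.futures (my own illustration, not the linked code); it assumes a crawl(url, website) function like the one defined below, and page_urls is a hypothetical list of listing-page URLs:

from concurrent.futures import ThreadPoolExecutor

def crawl_all(page_urls, website='', workers=8):
    # each listing page is independent, so the crawls can run concurrently
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(crawl, url, website) for url in page_urls]
        for f in futures:
            f.result()  # re-raise any exception from a worker thread

Note that the Python 2 crawl below keeps a global record counter, so a lock (or per-page counters) would be needed before running it concurrently.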

===============================================

The point is not the crawler itself, but the learning process.

# -*- coding:utf-8 -*-
import urllib2, urllib, time
from bs4 import BeautifulSoup
import sys, os
reload(sys)
sys.setdefaultencoding('utf-8')  # set the default output encoding

def crawl(url, website=""):
    img_dir = "我要个性网"
    if os.path.isdir(img_dir) == False:
        os.mkdir(img_dir)
    # add headers to mimic a browser
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/64.0.3282.119 Chrome/64.0.3282.119 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)  # build the request object
    page = urllib2.urlopen(req, timeout=20)  # set a timeout in case the URL is unreachable or slow to respond
    contents = page.read()  # fetch the page source (readline would fetch one line)
    #print contents
    soup = BeautifulSoup(contents, 'html.parser')
    alinks = soup.find_all('a', {'class': 'imgTitle'})
    global record
    for alink in alinks:
        # if record < 655:  # resume parameter after a disconnect or stall
        #     record += 1
        #     continue
        dirpath = img_dir + '/' + str(record).zfill(3) + '_' + alink.text
        print dirpath
        if alink.text.__contains__('/'):
            deal_error(dirpath + '\n')
            dirpath = img_dir + '/' + str(record).zfill(3) + '_' + alink.text.replace('/', 'or')
        if os.path.isdir(dirpath) == False:
            os.mkdir(dirpath)
        suburl = website + alink.get('href')
        #print suburl
        subreq = urllib2.Request(suburl, headers=headers)
        subpage = urllib2.urlopen(subreq, timeout=20)
        subcontents = subpage.read()
        # if record == 1:
        #     print subcontents
        subsoup = BeautifulSoup(subcontents, 'html.parser')
        imgs = subsoup.find_all('img', {'class': 'lazy'})
        cur = 0
        for img in imgs:
            cur += 1
            link = img.get('src')
            #print link
            filename = dirpath + '/%02d.jpg' % cur
            print filename
            try:
                urllib.urlretrieve(link, filename)  # download and save into the album folder
            except:
                deal_error(filename + "\n" + link + "\n")
        record += 1

def deal_error(string):
    fout = open("log_error.txt", "at")
    fout.write(string)
    fout.close()

record = 1
url = '.html'  # listing-page URL (base address not preserved in the post)
website = ''   # site root (not preserved in the post)
crawl(url, website)
pageNum = 1
while (True):
    pageNum += 1
    print "================== Requesting page %d ==================" % pageNum
    url = '.html' % pageNum  # page-N URL pattern (base address not preserved in the post)
    crawl(url, website)

# Problems encountered: Connection reset by peer
# Temporary failure in name resolution
# The program eventually terminates with a 404 NOT FOUND on the last page
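These failures are transient network errors, so rather than letting a single bad request abort the whole run, the urlopen calls can be wrapped in a small retry helper. A minimal sketch in the same Python 2 style (open_with_retry is a hypothetical name, not from the original code):

import time, urllib2

def open_with_retry(req, retries=3, timeout=20, delay=5):
    for attempt in range(retries):
        try:
            return urllib2.urlopen(req, timeout=timeout)
        except urllib2.HTTPError:
            raise  # e.g. the final 404 NOT FOUND: retrying will not help
        except (urllib2.URLError, IOError):
            # covers connection resets and DNS 'Temporary failure in name resolution'
            time.sleep(delay)
    raise IOError('giving up after %d attempts' % retries)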

Python 3 code

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests, os, threading, re

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/69.0.3497.100 Safari/537.36'}

def loge(msg):
    with open('error_log.txt', 'at+') as fout:
        try:
            fout.write(msg)
        except:
            fout.write('Warning: encoding error')

def save_img(url, path):
    with open(path, 'wb') as fout:
        response = requests.get(url, headers=headers).content
        fout.write(response)

def spider(url, website=''):
    path = os.path.join(os.getcwd(), 'images')
    if not os.path.exists(path):
        os.mkdir(path)
    response = requests.get(url, headers=headers).content
    soup = BeautifulSoup(response, 'html.parser')
    divs = soup.select('.txList')
    next_page = soup.find('div', {'class': 'page'})
    for div in divs:
        try:
            # strip characters that are illegal in directory names
            title = re.sub(r'[\\/:*?"<>|\n.]', '_', div.a.get('title'))
            dir_name = os.path.join(path, title)
            if not os.path.exists(dir_name):
                os.mkdir(dir_name)
        except:
            loge('Error: ' + str(div))
            continue
        response = requests.get(website + str(div.a.get('href'))).content
        soup = BeautifulSoup(response, 'html.parser')
        lis = soup.select('.tx-img')
        for li in lis:
            img_url = 'http:' + li.a.get('href')
            file_path = os.path.join(dir_name, img_url.split('/')[-1])
            # download each image on its own thread
            thread = threading.Thread(target=save_img, args=(img_url, file_path))
            thread.start()
    print(os.getpid(), url)
    if next_page:
        # follow the last pagination link on a new thread
        next_url = website + str(next_page.findAll('a')[-1].get('href'))
        thread = threading.Thread(target=spider, args=(next_url, website))
        thread.start()

def main():
    website = ''  # site root (not preserved in the post)
    url = '/'
    # the page structure changes after index_40
    spider(url, website)

if __name__ == '__main__':
    main()
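One detail worth noting in this version: the re.sub call is what makes arbitrary album titles safe to use as directory names, replacing every character that Windows forbids in paths (plus newlines and dots) with an underscore. A quick illustration with a made-up title:

import re
print(re.sub(r'[\\/:*?"<>|\n.]', '_', 'cute/avatars: vol.3?'))  # -> cute_avatars_ vol_3_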

 
