Python 3: scraping images from 飞G图 (girl13.com)

    • First version
    • Second version

Overview: scrape the images from http://www.girl13.com. The interval between page requests is controlled by:
self.time = 2  # request interval, 2 s by default, to guard against an IP ban
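
As a minimal sketch of that pacing idea (the /page/{n} URL pattern is the one the full script uses; the page range here is only an illustration):

import time
import requests

session = requests.Session()
delay = 2  # seconds to wait between page requests, to reduce the risk of an IP ban

for page in range(1, 4):  # illustrative range only
    response = session.get("http://www.girl13.com/page/{}".format(page))
    print(page, response.status_code)
    time.sleep(delay)  # pause before requesting the next page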

First version

import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # delay between pages, to avoid an IP ban

    # check the response status
    def get_status(self, url):
        response = self.session.get(url, headers=self.headers)
        if response.status_code == 200:
            return response
        else:
            print("ERROR: network request failed!")
            return False

    # home page, establish the session
    def get_index(self, url):
        response = self.get_status(url)
        if response:
            # response.encoding = "utf-8"
            # html = response.text
            # print(html)
            print("Home page reached, session established...")
            return True
        else:
            print("ERROR: failed to access the home page!")
            return False

    # parse a listing page into {image name: image url}
    def parse_html(self, url):
        title_url = {}
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        columns = html.select("#loop-square .column-post")
        for column in columns:
            title = column.select(".entry-title")[0].text if column.select(".entry-title") else None
            img_url = column.select(".entry-content.cf img")[0].get("src") \
                if column.select(".entry-content.cf img") else None
            # print(title, img_url)
            if not title:
                continue
            title = os.path.basename(img_url)
            title_url[title] = img_url
        return title_url

    # find the number of the last page
    def get_last_page(self, url):
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        if pages[-1].text == "下一页":
            # the last link is the "next page" button, so the page number precedes it
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    # generate the url of every listing page
    @staticmethod
    def next_page(last_page):
        for i in range(1, last_page + 1):
            # url = "https://www.mzitu.com/zipai/comment-page-376"
            url = "http://www.girl13.com/page/{}".format(i)
            # print(url)
            yield url

    # download a single image
    def download(self, path, url):
        print(url)
        with open(path, "wb") as f:
            response = self.get_status(url)
            content = response.content
            f.write(content)

    def main_(self):
        # home page, establish the session
        url = "http://www.girl13.com"
        if not self.get_index(url):
            return None
        # find the last page
        url = "http://www.girl13.com/page/1"
        last_page = self.get_last_page(url)
        if not last_page:
            return None
        path = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(path):
            os.mkdir(path)
        # walk through the pages
        urls = self.next_page(last_page)
        for url in urls:
            title_url = self.parse_html(url)
            thread_list = []
            for title in title_url:
                path = os.path.abspath(os.path.join(os.getcwd(), "image", title))
                url = title_url[title]
                t = threading.Thread(target=self.download, args=(path, url))
                thread_list.append(t)
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
            time.sleep(self.time)

    def main(self):
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()
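
In this version each listing page is processed as a batch: one download thread is started per image, the threads are joined before moving on, and the script then sleeps for self.time seconds before requesting the next page. Nothing is wrapped in try/except, so a dropped connection or a post whose image URL is missing raises an unhandled exception, which can end the whole crawl; that is the weakness the second version addresses.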

Second version

Fixes the problem where an unexpected error could make the first version exit.
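
The key change is in get_status: the request is now wrapped in try/except so a dropped connection is reported instead of propagating and killing the crawl (shown in isolation here; the full listing follows). parse_html gets a similar guard, catching the TypeError raised when a post has no image URL.

    def get_status(self, url):
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                return response
            else:
                print("ERROR: network request failed!")
                return False
        except requests.exceptions.ConnectionError:
            print("Connection attempt failed: the remote host did not respond properly within the time limit")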

# -*- coding: utf-8 -*-
import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # delay between pages, to avoid an IP ban

    def get_status(self, url):
        """
        Check the response status
        :param url: address to request
        :return: the response if the status code is 200, otherwise False
        """
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                return response
            else:
                print("ERROR: network request failed!")
                return False
        except requests.exceptions.ConnectionError:
            print("Connection attempt failed: the remote host did not respond properly within the time limit")

    def get_index(self, url):
        """
        Home page, establish the session
        :param url: home page address, used to establish the session
        :return: True if the home page is reachable, otherwise False
        """
        response = self.get_status(url)
        if response:
            # response.encoding = "utf-8"
            # html = response.text
            # print(html)
            print("Home page reached, session established...")
            return True
        else:
            print("ERROR: failed to access the home page!")
            return False

    def parse_html(self, url):
        """
        Parse a listing page and collect image names and urls
        :param url: address of the current page
        :return: a dict mapping image name to image url
        """
        title_url = {}
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        columns = html.select("#loop-square .column-post")
        for column in columns:
            title = column.select(".entry-title")[0].text if column.select(".entry-title") else None
            img_url = column.select(".entry-content.cf img")[0].get("src") \
                if column.select(".entry-content.cf img") else None
            # print(title, img_url)
            if not title:
                continue
            try:
                title = os.path.basename(img_url)
                title_url[title] = img_url
            except TypeError:
                print("ERROR:", img_url)
        return title_url

    def get_last_page(self, url):
        """
        Find the number of the last page from the pagination bar
        :param url: url of the first page
        :return: the last page number as an int
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        if pages[-1].text == "下一页":
            # the last link is the "next page" button, so the page number precedes it
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    @staticmethod
    def next_page(last_page):
        """
        Generate the url of every listing page
        :param last_page: number of the last page
        :return: a generator yielding page urls
        """
        for i in range(1, last_page + 1):
            # url = "https://www.mzitu.com/zipai/comment-page-376"
            url = "http://www.girl13.com/page/{}".format(i)
            # print(url)
            yield url

    def download(self, path, url):
        """
        Download a single image
        :param path: file path to save to
        :param url: image url
        :return:
        """
        # print(url)
        with open(path, "wb") as f:
            response = self.get_status(url)
            content = response.content
            f.write(content)

    def main_(self):
        # home page, establish the session
        url = "http://www.girl13.com"
        if not self.get_index(url):
            return None
        # find the last page
        url = "http://www.girl13.com/page/1"
        last_page = self.get_last_page(url)
        if not last_page:
            return None
        path = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(path):
            os.mkdir(path)
        # walk through the pages
        urls = self.next_page(last_page)
        for url in urls:
            print(url)
            title_url = self.parse_html(url)
            thread_list = []
            for title in title_url:
                path = os.path.abspath(os.path.join(os.getcwd(), "image", title))
                url = title_url[title]
                t = threading.Thread(target=self.download, args=(path, url))
                thread_list.append(t)
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
            time.sleep(self.time)

    def main(self):
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()
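
To run it, save the script and start it directly with python3; images are written to an image/ directory under the current working directory. If the site starts rejecting requests, the per-page delay can be raised on the instance before starting, for example:

if __name__ == '__main__':
    girl = Girl13()
    girl.time = 5  # a longer pause between pages further reduces the risk of an IP ban
    girl.main()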

