Python 3: scraping images from 飞G图 (girl13.com)

    • First version
    • Second version

Overview: scrape the images from http://www.girl13.com. The interval between page requests is controlled by:
self.time = 2  # request interval, 2 s by default, to guard against an IP ban
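
As a minimal sketch of that pacing idea (the /page/{n} URL pattern is the one the full script uses; the page range here is only an illustration):

import time
import requests

session = requests.Session()
delay = 2  # seconds to wait between page requests, to reduce the risk of an IP ban

for page in range(1, 4):  # illustrative range only
    response = session.get("http://www.girl13.com/page/{}".format(page))
    print(page, response.status_code)
    time.sleep(delay)  # pause before requesting the next page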

First version

import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # delay between pages, to avoid an IP ban

    # check the response status
    def get_status(self, url):
        response = self.session.get(url, headers=self.headers)
        if response.status_code == 200:
            return response
        else:
            print("ERROR: network request failed!")
            return False

    # home page, establish the session
    def get_index(self, url):
        response = self.get_status(url)
        if response:
            # response.encoding = "utf-8"
            # html = response.text
            # print(html)
            print("Home page reached, session established...")
            return True
        else:
            print("ERROR: failed to access the home page!")
            return False

    # parse a listing page into {image name: image url}
    def parse_html(self, url):
        title_url = {}
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        columns = html.select("#loop-square .column-post")
        for column in columns:
            title = column.select(".entry-title")[0].text if column.select(".entry-title") else None
            img_url = column.select(".entry-content.cf img")[0].get("src") \
                if column.select(".entry-content.cf img") else None
            # print(title, img_url)
            if not title:
                continue
            title = os.path.basename(img_url)
            title_url[title] = img_url
        return title_url

    # find the number of the last page
    def get_last_page(self, url):
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        if pages[-1].text == "下一页":
            # the last link is the "next page" button, so the page number precedes it
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    # generate the url of every listing page
    @staticmethod
    def next_page(last_page):
        for i in range(1, last_page + 1):
            # url = "https://www.mzitu.com/zipai/comment-page-376"
            url = "http://www.girl13.com/page/{}".format(i)
            # print(url)
            yield url

    # download a single image
    def download(self, path, url):
        print(url)
        with open(path, "wb") as f:
            response = self.get_status(url)
            content = response.content
            f.write(content)

    def main_(self):
        # home page, establish the session
        url = "http://www.girl13.com"
        if not self.get_index(url):
            return None
        # find the last page
        url = "http://www.girl13.com/page/1"
        last_page = self.get_last_page(url)
        if not last_page:
            return None
        path = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(path):
            os.mkdir(path)
        # walk through the pages
        urls = self.next_page(last_page)
        for url in urls:
            title_url = self.parse_html(url)
            thread_list = []
            for title in title_url:
                path = os.path.abspath(os.path.join(os.getcwd(), "image", title))
                url = title_url[title]
                t = threading.Thread(target=self.download, args=(path, url))
                thread_list.append(t)
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
            time.sleep(self.time)

    def main(self):
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()
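
In this version each listing page is processed as a batch: one download thread is started per image, the threads are joined before moving on, and the script then sleeps for self.time seconds before requesting the next page. Nothing is wrapped in try/except, so a dropped connection or a post whose image URL is missing raises an unhandled exception, which can end the whole crawl; that is the weakness the second version addresses.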

Second version

Fixes the problem where an unexpected error could make the first version exit.
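
The key change is in get_status: the request is now wrapped in try/except so a dropped connection is reported instead of propagating and killing the crawl (shown in isolation here; the full listing follows). parse_html gets a similar guard, catching the TypeError raised when a post has no image URL.

    def get_status(self, url):
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                return response
            else:
                print("ERROR: network request failed!")
                return False
        except requests.exceptions.ConnectionError:
            print("Connection attempt failed: the remote host did not respond properly within the time limit")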

# -*- coding: utf-8 -*-
import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # delay between pages, to avoid an IP ban

    def get_status(self, url):
        """
        Check the response status
        :param url: address to request
        :return: the response if the status code is 200, otherwise False
        """
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                return response
            else:
                print("ERROR: network request failed!")
                return False
        except requests.exceptions.ConnectionError:
            print("Connection attempt failed: the remote host did not respond properly within the time limit")

    def get_index(self, url):
        """
        Home page, establish the session
        :param url: home page address, used to establish the session
        :return: True if the home page is reachable, otherwise False
        """
        response = self.get_status(url)
        if response:
            # response.encoding = "utf-8"
            # html = response.text
            # print(html)
            print("Home page reached, session established...")
            return True
        else:
            print("ERROR: failed to access the home page!")
            return False

    def parse_html(self, url):
        """
        Parse a listing page and collect image names and urls
        :param url: address of the current page
        :return: a dict mapping image name to image url
        """
        title_url = {}
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        columns = html.select("#loop-square .column-post")
        for column in columns:
            title = column.select(".entry-title")[0].text if column.select(".entry-title") else None
            img_url = column.select(".entry-content.cf img")[0].get("src") \
                if column.select(".entry-content.cf img") else None
            # print(title, img_url)
            if not title:
                continue
            try:
                title = os.path.basename(img_url)
                title_url[title] = img_url
            except TypeError:
                print("ERROR:", img_url)
        return title_url

    def get_last_page(self, url):
        """
        Find the number of the last page from the pagination bar
        :param url: url of the first page
        :return: the last page number as an int
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        if pages[-1].text == "下一页":
            # the last link is the "next page" button, so the page number precedes it
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    @staticmethod
    def next_page(last_page):
        """
        Generate the url of every listing page
        :param last_page: number of the last page
        :return: a generator yielding page urls
        """
        for i in range(1, last_page + 1):
            # url = "https://www.mzitu.com/zipai/comment-page-376"
            url = "http://www.girl13.com/page/{}".format(i)
            # print(url)
            yield url

    def download(self, path, url):
        """
        Download a single image
        :param path: file path to save to
        :param url: image url
        :return:
        """
        # print(url)
        with open(path, "wb") as f:
            response = self.get_status(url)
            content = response.content
            f.write(content)

    def main_(self):
        # home page, establish the session
        url = "http://www.girl13.com"
        if not self.get_index(url):
            return None
        # find the last page
        url = "http://www.girl13.com/page/1"
        last_page = self.get_last_page(url)
        if not last_page:
            return None
        path = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(path):
            os.mkdir(path)
        # walk through the pages
        urls = self.next_page(last_page)
        for url in urls:
            print(url)
            title_url = self.parse_html(url)
            thread_list = []
            for title in title_url:
                path = os.path.abspath(os.path.join(os.getcwd(), "image", title))
                url = title_url[title]
                t = threading.Thread(target=self.download, args=(path, url))
                thread_list.append(t)
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
            time.sleep(self.time)

    def main(self):
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()
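
To run it, save the script and start it directly with python3; images are written to an image/ directory under the current working directory. If the site starts rejecting requests, the per-page delay can be raised on the instance before starting, for example:

if __name__ == '__main__':
    girl = Girl13()
    girl.time = 5  # a longer pause between pages further reduces the risk of an IP ban
    girl.main()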

