爬虫获取主页信息

阿帅啊，长点心吧 / 2025-02-21 / 原文

爬虫获取主页信息

1.如何使Python获取到网页的源代码
	urllib：用来模拟浏览器
	urllib.request：获取主页源码
	urllib.request.Request()：构建请求对象（可在其中设置请求头等信息）
	add_header("user-agent")：添加请求头，伪装浏览器
	urllib.request.urlopen()：打开URL获取源码
	
2.过滤：使用 re 模块的正则表达式，从网页源代码中筛选出需要的内容（例如图片地址）

爬虫爬取主页信息

#导入模块
import urllib.request

#定义类
class GetHtml(object):
    """Fetch the raw source of a single page with ``urllib.request``.

    Attributes:
        url: Address given to the constructor.
        index: Response object of the most recent ``get_index`` call; it is
            already closed by the time ``get_index`` returns.
    """

    def __init__(self, URL):
        """Store the address to fetch.

        Args:
            URL: Address later passed to ``urllib.request.urlopen``.
        """
        self.url = URL

    def get_index(self):
        """Open ``self.url`` and return the complete response body.

        Returns:
            The body as ``bytes``.

        Any exception raised by ``urlopen`` or ``read`` propagates unchanged.
        """
        # Use the response as a context manager so it is closed even if
        # read() raises; the original left the response open.
        with urllib.request.urlopen(self.url) as response:
            self.index = response
            return response.read()


# Create a GetHtml object for the target site
html = GetHtml("http://www.megshuai.top:8012/")
print(html.get_index())		# print the fetched page source

#添加请求头信息

#导入模块
import urllib.request

class GetHtml(object):
    """Fetch a page while sending a caller-supplied User-Agent header.

    Attributes:
        url: Address given to the constructor.
        head: Value sent as the ``user-agent`` request header.
        Agent: ``urllib.request.Request`` built by the latest ``get_index``.
        index: Response object of the latest ``get_index`` call; it is
            already closed by the time ``get_index`` returns.
    """

    def __init__(self, URL, HEAD):
        """Store the address and the User-Agent string.

        Args:
            URL: Address to request.
            HEAD: Value for the ``user-agent`` header.
        """
        self.url = URL
        self.head = HEAD

    def get_index(self):
        """Request ``self.url`` with the stored User-Agent and return the body.

        Returns:
            The response body as ``bytes``.

        Any exception raised by ``urlopen`` or ``read`` propagates unchanged.
        """
        # Build a Request object so a header can be attached before sending.
        self.Agent = urllib.request.Request(self.url)
        self.Agent.add_header("user-agent", self.head)
        # Context manager closes the response even if read() raises; the
        # original left the response open.
        with urllib.request.urlopen(self.Agent) as response:
            self.index = response
            return response.read()


# Instantiate the class with the site address and a browser User-Agent,
# then print the page source.
html = GetHtml(
    "http://www.megshuai.top:8012/",
    "Mozilla/5.0 (Windows NT 10.0; Win32; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/100.0.4240.183 Safari/537.36",
)
print(html.get_index())

#爬虫爬取主页信息
#下载网站所有图片
#导入模块
import urllib.request
import re

class GetHtml(object):
    """Download a page and the ``.jpg`` images whose names appear in it.

    Attributes:
        url: Base address given to the constructor; never modified.
        head: Value sent as the ``user-agent`` request header.
        Agent: ``urllib.request.Request`` of the most recent fetch.
        index: Response object of the most recent fetch (already closed).
        imglist: Image file names (``bytes``) found by the latest ``get_list``.
        starimage: Image addresses (``str``) built by the latest ``get_list``.
    """

    def __init__(self, URL, HEAD):
        """Store the base address and the User-Agent string.

        Args:
            URL: Address of the page to scan; image names are appended to it.
            HEAD: Value for the ``user-agent`` header.
        """
        self.url = URL
        self.head = HEAD

    def _fetch(self, url):
        """Return the body of *url* as bytes, sending the stored User-Agent."""
        self.Agent = urllib.request.Request(url)
        self.Agent.add_header("user-agent", self.head)
        # Context manager closes the response even if read() raises.
        with urllib.request.urlopen(self.Agent) as response:
            self.index = response
            return response.read()

    def get_index(self):
        """Return the body of ``self.url`` as bytes."""
        return self._fetch(self.url)

    def get_list(self):
        """Build the list of image addresses referenced by the page.

        An image name is 16 word characters followed by a literal ``.jpg``.
        Each name is decoded as UTF-8 and appended to ``self.url``.

        Returns:
            A list of address strings (empty when nothing matches).
        """
        # Raw bytes pattern with an escaped dot: the original "." matched any
        # character, so text such as "...Xjpg" was treated as an image name.
        self.imglist = re.findall(rb"\w{16}\.jpg", self.get_index())
        self.starimage = [
            self.url + str(name, encoding="utf8") for name in self.imglist
        ]
        return self.starimage

    def get_image(self):
        """Download every image and save it as ``1.jpg``, ``2.jpg``, ...

        Files are created relative to the current working directory, numbered
        in the order the names appear in the page.
        """
        # Iterate with a local variable; the original used ``self.url`` as the
        # loop target, which overwrote the base address on the instance.
        for num, image_url in enumerate(self.get_list(), start=1):
            # Download first so a failed request does not leave an empty file.
            content = self._fetch(image_url)
            with open(str(num) + ".jpg", "wb") as f:
                f.write(content)

# Instantiate the class with the site address and a browser User-Agent,
# then download every image found on the page.
# The User-Agent is assembled from adjacent literals; the original backslash
# continuation dropped the space between "Gecko)" and "Chrome/".
html = GetHtml(
    "http://www.megshuai.top:8012/",
    "Mozilla/5.0 (Windows NT 10.0; Win32; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/100.0.4240.183 Safari/537.36",
)
html.get_image()