写了一个简单python爬虫

项目很简单,地址在:https://github.com/qing2zijin/codess/tree/master/spider/crawl_blog

由于是非科班,我觉得还行,当然还有很多需要完善的地方,例如:加入一个URL管理模块。

2021.3.29 时隔多日,再次更新,代码如下,更简洁:


import requests, json

def spider_hand(ipage):
    """Fetch one page of the foreverblog.cn blogroll listing.

    Args:
        ipage: page number to request (sent as the ``page`` form field).

    Returns:
        dict: decoded JSON response; expected to carry a ``data`` list of
        blog entries and a ``meta`` pagination block.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'}
    r = requests.post('https://www.foreverblog.cn/api/blogs',
                      data={'page': ipage}, headers=headers)
    # r.json() is equivalent to json.loads(r.content.decode('utf-8'))
    # but shorter and honors the response's declared encoding.
    # (The old commented-out file dump after `return` was unreachable
    # dead code and has been removed.)
    return r.json()
def analysis_page(raw_data, j):
    """Append one page of blog entries to ``rawblogs.log``.

    Args:
        raw_data: decoded API response with ``meta.per_page`` and a
            ``data`` list of entry dicts (name/message/url/date keys).
        j: zero-based page index, used to compute the running entry number.
    """
    try:
        per_page = raw_data['meta']['per_page']
        entries = raw_data['data']
    except (KeyError, TypeError):
        # Malformed response: keep the original best-effort behavior
        # (silently skip the page) instead of crashing the whole crawl.
        return
    # Open the log once per page instead of re-opening it per entry,
    # and let `with` close it even if a write fails.
    with open('rawblogs.log', 'a', encoding='utf-8') as log:
        # Iterating the actual entry list (rather than range(per_page))
        # also handles a short final page without an IndexError.
        for i, entry in enumerate(entries):
            log.write(
                str(i + 1 + j * per_page) + ': name:' + entry.get('name', '') + '  '
                + 'message:' + entry.get('message', '') + '  '
                + 'url:' + entry.get('url', '') + '  '
                + 'date:' + entry.get('date', '') + '\n'
            )

if __name__ == '__main__':
    # Page 0 is fetched first only to learn the total page count from the
    # response metadata; its entries are written out as well.
    # (Removed the stray trailing semicolon from the original.)
    data = spider_hand(0)
    total_page = data['meta']['last_page']  # int: index of the last page
    analysis_page(data, 0)
    print('YES')
    for page in range(1, total_page + 1):
        analysis_page(spider_hand(page), page)
        print('YES')  # progress marker, one per page written

2021.3.20补充:

花了一点时间,将代码优化了一下,使用面向对象方法重写了一遍,程序运行速度提高了不少,可扩展性大大加强。


import requests, json
from lxml import etree

class blog:
    """Crawl the foreverblog.cn blogroll and render it as ``blogroll.html``."""

    def __init__(self, header):
        self.headers = header   # HTTP request headers (User-Agent etc.)
        self.data = {}          # decoded JSON of the current listing page
        self.last_page = 0      # total number of listing pages (from API meta)
        self.per_page = 0       # entries per listing page (from API meta)
        self.message = ''       # description of the current blog entry
        self.name = ''          # name of the current blog entry
        self.date = ''          # join date of the current blog entry
        self.num = 1            # 1-based number of the page being fetched
        self.raw_url = ''       # foreverblog.cn detail-page URL of the entry
        self.real_url = ''      # resolved external URL of the blog

    def get_raw_data(self):
        """Fetch listing page ``self.num`` and cache it plus its metadata."""
        try:
            r = requests.post('https://www.foreverblog.cn/api/blogs',
                              data={'page': self.num}, headers=self.headers)
            self.data = r.json()
            self.per_page = self.data['meta']['per_page']
            self.last_page = self.data['meta']['last_page']
        except Exception:
            # Best-effort: a failed fetch leaves the previous state in
            # place, matching the original silent-skip behavior.
            pass

    def true_url(self):
        """Resolve ``self.raw_url`` to the blog's real address via its detail page."""
        # BUG FIX: requests.get's second positional argument is ``params``,
        # so the original never actually sent the headers. Pass them by
        # keyword instead.
        url_data = requests.get(self.raw_url, headers=self.headers)
        page = etree.HTML(url_data.content.decode('utf-8'))
        found = page.xpath('//div[@class="cleft"]/a/@href')
        print('已经找到一个博主的真实网址!')
        self.real_url = found[0]

    def analysis_page(self):
        """Append every entry of the cached listing page to ``blogroll.html``."""
        try:
            # Open once per page (the original re-opened the file for every
            # entry) and let `with` close it on any exit path.
            with open('blogroll.html', 'a', encoding='utf-8') as out:
                # Iterating the real entry list also copes with a short
                # final page without an IndexError.
                for i, entry in enumerate(self.data['data']):
                    self.raw_url = entry['url']
                    self.name = entry['name']
                    self.message = entry['message']
                    self.date = entry['date']
                    self.true_url()
                    serial = i + 1 + (self.num - 1) * self.per_page
                    # BUG FIX: well-formed anchor markup — the original
                    # emitted href="url "target="blank"> (stray space,
                    # misplaced quote, and "blank" instead of "_blank").
                    out.write(
                        '<p>[' + str(serial)
                        + ']  <a href="' + self.real_url
                        + '" target="_blank">' + self.name
                        + '</a>,' + self.message
                        + ',' + self.date + '</p>\n'
                    )
                    print('已经写入一个博客%s' % serial)
        except Exception:
            # Keep the crawl going if one page is malformed or a lookup fails.
            pass

    def x_main(self):
        """Drive the whole crawl: write the HTML header, all pages, then the footer."""
        with open('blogroll.html', 'w', encoding='utf-8') as out:
            out.write(
            '<!DOCTYPE html><html lang="zh-Hans"><head><meta charset="utf-8"><title>blogroll</title><link rel="icon" href="/favicon.ico"></head><body><h1>Blogroll</h1><hr>\n'
            )
        # First fetch both learns last_page and yields page 1's entries
        # (the original fetched page 1 twice: once for metadata, once in
        # the loop).
        self.get_raw_data()
        self.analysis_page()
        # BUG FIX (off-by-one): pages run 1..last_page; the old loop ran
        # last_page+1 times and requested a page past the end.
        for _ in range(1, self.last_page):
            self.num += 1
            self.get_raw_data()
            self.analysis_page()
        # BUG FIX: the closing tags were written to 'blogroll.h' (typo),
        # leaving the generated blogroll.html without </body></html>.
        with open('blogroll.html', 'a', encoding='utf-8') as out:
            out.write('</body></html>')
if __name__ == '__main__':
    # Spoof a desktop-browser User-Agent so the API responds normally.
    ua = ('Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:86.0) '
          'Gecko/20100101 Firefox/86.0')
    crawler = blog({'User-Agent': ua})
    crawler.x_main()
    # Release the crawler once the run is complete.
    del crawler
    # Parts of this code were adapted from:
    # https://www.runoob.com/python/python-object.html