Passing the Time

To reset a bit, I have been reading classic novels from China and abroad to pass the time, and I used Python to scrape a few of them. The scraper is split into three files: spider_main.py drives the crawl, url_manage.py collects the chapter links from a book's table-of-contents page, and oput.py downloads each chapter and appends it to a text file.


百年孤独.txt
福尔摩斯探案全集.txt
华盖集.txt
了不起的盖茨比.txt
挪威的森林.txt
欧亨利短篇小说.txt
文化苦旅.txt

spider_main.py


import url_manage
import oput

class Spider_Main():

    def __init__(self):
        self.urls = url_manage.urls()          # collects chapter links from the catalog page
        self.content = oput.get_content()      # downloads each chapter and saves it to disk

    def crawl(self, url):
        pool_url = self.urls.get_url(url)      # list of chapter hrefs scraped from the catalog
        for i in pool_url:
            self.content.crawl_content(i)

if __name__ == '__main__':
    url = 'http://www.mingzhuxiaoshuo.com/waiguo/120/'  # catalog (table of contents) page
    obj_spider = Spider_Main()
    obj_spider.crawl(url)
# http://www.mingzhuxiaoshuo.com/jinxiandai/

url_manage.py


import urllib.request
from lxml import etree

class urls(object):
    def get_url(self, url):
        # Send a browser-like User-Agent so the site does not reject the request
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        try:
            request = urllib.request.Request(url=url, headers=headers)
            # The site serves GBK-encoded pages
            data = urllib.request.urlopen(request, timeout=10).read().decode('gbk')
            node = etree.HTML(data)
            # Collect the chapter hrefs from the catalog list
            url_p = node.xpath('//div[@class="list"]/ul/li/a/@href')
            #print(url_p)
            return url_p
        except Exception as e:
            print("error fetching catalog:", e)
            return []   # an empty list keeps the caller's loop from crashing

oput.py


import urllib.request
from lxml import etree
import time

class get_content(object):

    def crawl_content(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
        full_url = "http://www.mingzhuxiaoshuo.com" + url   # chapter links are site-relative

        try:
            request = urllib.request.Request(url=full_url, headers=headers)
            data = urllib.request.urlopen(request, timeout=10).read().decode('gbk')
            node = etree.HTML(data)
            content_title = node.xpath('//h1/text()')                # chapter title
            content = node.xpath('//div[@class="width"]/p/text()')   # chapter paragraphs

            with open('novel/novel.txt', 'a', encoding='utf8') as file:
                file.write(content_title[0] + "\t" + full_url + "\n\n")
                #time.sleep(0.3)   # optional throttle between chapters
                for line in content:
                    print("writing...")
                    file.write(line + '\n')
                file.write("\n\n")
        except Exception as e:
            # Log chapters that failed so they can be re-fetched later
            print("error:", e)
            with open('content.txt', 'a', encoding='utf8') as file:
                file.write('missing' + "\t" + full_url + "\n\n")
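
crawl_content appends everything to a single novel/novel.txt, while the list at the top has one file per novel, so presumably the output was renamed between runs. A minimal sketch of that assumed workflow (crawl_one_novel and the rename step are my additions, not part of the original code):

import os
import spider_main

# Hypothetical per-novel driver (assumed workflow, not in the original code)
def crawl_one_novel(catalog_url, out_name):
    spider = spider_main.Spider_Main()
    spider.crawl(catalog_url)                          # appends all chapters to novel/novel.txt
    os.replace('novel/novel.txt', 'novel/' + out_name) # keep one file per novel

crawl_one_novel('http://www.mingzhuxiaoshuo.com/waiguo/120/', 'some_novel.txt')  # use the real title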

Alas, the world is ever unpredictable.