月牙博客 专注于“胡说八道”。

viewsource查看网页程序

发布时间:2022-12-15 13:55:00  类别:life

前言

view source 即为查看网页源代码。

程序

import urllib.request as UR
import random, time, re
from lxml import etree
import urllib.parse as urlParse
import sys
#import json
import os

def delay(n=1):
    """Pause for a random duration between 0 and ``n`` seconds.

    Adds jitter between consecutive requests. Always returns None.
    """
    pause_seconds = random.random() * n
    time.sleep(pause_seconds)

# isPc: whether to pose as a desktop (PC) browser. Sometimes random_headers
# is not needed at all and the caller can write its own headers dict.
def random_headers(isPc, cookie=None):
    """Build request headers with a randomly chosen User-Agent.

    Args:
        isPc: truthy to pick from the desktop browser pool, falsy to pick
            from the mobile browser pool.
        cookie: optional cookie string; when given (anything but None) it is
            added under the "cookie" key.

    Returns:
        A dict with a "User-Agent" entry and, optionally, a "cookie" entry.
    """
    desktop_agents = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36 OPR/77.0.4054.277",
    )
    # The first iPhone entry appears twice on purpose: it keeps the original
    # selection weights of the mobile pool.
    mobile_agents = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 10; HarmonyOS; FRL-AN00a; HMSCore 6.8.0.332) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.105 HuaweiBrowser/12.1.4.304 Mobile Safari/537.36",
    )

    pool = desktop_agents if isPc else mobile_agents
    headers = {"User-Agent": random.choice(pool)}
    if cookie is not None:
        headers["cookie"] = cookie
    return headers

def get_htmlData(url, Method, decodel=None, headers=None, encodel=None, data_dic=None, cnt=None):
    """Fetch ``url`` with urllib and return the decoded response body.

    Args:
        url: address to request.
        Method: "GET" or "POST"; any other value is rejected.
        decodel: fallback codec for decoding the body (default "utf-8").
            A charset announced in the response Content-Type takes priority.
        headers: dict of request headers; None means no extra headers.
        encodel: codec used to encode the POST form body (default "utf-8").
        data_dic: form fields for a POST request; required when Method is
            "POST".
        cnt: maximum number of attempts (default 3).

    Returns:
        The response body as ``str``, or None when the method is
        unsupported, a POST has no form data, or every attempt failed.
        Errors raised while requesting or decoding are printed, not raised.
    """
    if cnt is None:
        cnt = 3
    if decodel is None:
        decodel = "utf-8"
    if encodel is None:
        encodel = "utf-8"
    if headers is None:
        # urllib's Request iterates over headers, so None must become a dict.
        headers = {}

    if Method == "POST":
        if data_dic is None:
            print("data_dic is empty !")
            return None
    elif Method != "GET":
        print("目前只支持GET和POST这两种方式")
        return None

    data = None
    out_time = 5
    attempt = 1
    # Run exactly ``cnt`` attempts (attempt numbers 1..cnt).
    while attempt <= cnt:
        if attempt > 1:
            print("第%s次请求开始%s" % (attempt, url))
        try:
            formdata = None
            if Method == "POST":
                formdata = urlParse.urlencode(data_dic).encode(encodel)
            request = UR.Request(url=url, data=formdata, headers=headers, method=Method)
            # The context manager closes the connection even if read() fails.
            with UR.urlopen(request, timeout=out_time) as response:
                charset = response.info().get_param("charset")
                raw = response.read()
            # Prefer the server-declared charset; undecodable bytes are dropped.
            data = raw.decode(charset or decodel, "ignore")
        except Exception as error:
            # Best effort: report the failure and fall through to a retry.
            print(error)
        if data is not None:
            break
        attempt += 1
        if attempt <= cnt:
            # Give the next attempt a longer timeout and wait a random moment.
            out_time += 2
            delay(1)
    return data

def html_lxml(data, xpath):
    """Parse an HTML string with lxml and evaluate an XPath expression on it.

    Args:
        data: HTML text, or None.
        xpath: XPath expression to evaluate against the parsed document.

    Returns:
        Whatever the XPath query yields, or None when ``data`` is None.
    """
    if data is None:
        return None
    tree = etree.HTML(data)
    return tree.xpath(xpath)

def html_re(data, pattern, isList, num=0):
    """Run a regular expression (DOTALL mode) over ``data``.

    Args:
        data: text to search, or None.
        pattern: regular expression source; compiled with ``re.S``.
        isList: truthy to return every match (``findall``), falsy to return
            only the first match.
        num: group to return for the single-match case (default 0, the
            whole match).

    Returns:
        None when ``data`` is None; the ``findall`` list when ``isList`` is
        truthy; otherwise the requested group of the first match, or None if
        nothing matched.
    """
    # Compile first so an invalid pattern is reported even for empty data.
    regex = re.compile(pattern, re.S)
    if data is None:
        return None
    if isList:
        return regex.findall(data)
    found = regex.search(data)
    return None if found is None else found.group(num)

def File_IO(file,Mode,content=None,encodel=None):
    """Read a text file, or write ``content`` to it.

    Args:
        file: path of the file.
        Mode: "r" (or "R") to read; any other value is passed to ``open`` as
            the write/append mode (for example "w" or "a"). The mode is
            lower-cased first because ``open`` rejects upper-case modes.
        content: text to write; ignored when reading.
        encodel: text encoding (default "utf-8").

    Returns:
        The file's text when reading, or None when the file does not exist.
        Always None when writing; if ``content`` is None a notice is printed
        and the file is left untouched.
    """
    if encodel is None:
        encodel = "utf-8"
    mode = Mode.lower()

    if mode == "r":
        # Only reading needs the file to exist already.
        if not os.path.isfile(file):
            return None
        with open(file, mode, encoding=encodel) as rfile:
            return rfile.read()

    if content is None:
        print("数据为空!")
        return None
    # Write modes create the file when it is missing.
    with open(file, mode, encoding=encodel) as ofile:
        ofile.write(content)
    return None

def view_source(base_url,url, Method, decodel, encodel, cnt=2):
    """Fetch the source of ``url`` through the mirror page at ``base_url``.

    The mirror page is first loaded with GET to read the value of its
    ``vps_token`` input. That token and the target ``url`` are then submitted
    with ``Method``, and the text of the ``rawcode`` textarea in the reply is
    returned.

    Args:
        base_url: address of the mirror page.
        url: page whose source is wanted.
        Method: HTTP method for the form submission.
        decodel: fallback codec for decoding responses.
        encodel: codec for encoding the form body.
        cnt: attempt limit forwarded to ``get_htmlData``.

    Returns:
        The extracted source text with ``&lt;``/``&gt;`` turned back into
        angle brackets, or None when any step fails.
    """
    headers = random_headers(True)
    landing = get_htmlData(base_url, "GET", decodel=decodel, headers=headers, encodel=encodel, cnt=cnt)
    if landing is None:
        return None

    tokens = html_lxml(landing, '//input[@id="vps_token"]/@value')
    # An XPath miss yields an empty list, not None, so test for emptiness.
    if not tokens:
        return None

    data_dic = {"vps_token": tokens[0], "uri": url}
    pre_data = get_htmlData(base_url, Method, decodel, headers, encodel, data_dic, cnt=cnt)
    try:
        # Parsing and indexing both sit in the try: a failed request (None)
        # or a reply without the textarea must not raise to the caller.
        lst_data = html_lxml(pre_data, '//textarea[@id="rawcode"]/text()')
        data = lst_data[0].replace("&lt;", "<").replace("&gt;", ">")
    except Exception as err:
        data = None
        print(err,base_url)
    return data

def view_source2(base_url, url,Method,decodel,encodel, cnt=2):
    """Fetch the source of ``url`` through the mirror at ``base_url``.

    Submits ``{"url": url}`` to the mirror with ``Method`` and returns the
    first text node found under ``//body/pre/code`` in the reply.

    Args:
        base_url: address of the mirror service.
        url: page whose source is wanted.
        Method: HTTP method used for the request.
        decodel: fallback codec for decoding the response.
        encodel: codec for encoding the form body.
        cnt: attempt limit forwarded to ``get_htmlData``.

    Returns:
        The extracted text with ``&lt;``/``&gt;`` turned back into angle
        brackets, or None when the request or the extraction fails (the
        error is printed together with ``base_url``).
    """
    form_fields = {"url":url}
    ua_headers = random_headers(True)
    reply = get_htmlData(base_url,Method,decodel,ua_headers,encodel,form_fields,cnt=cnt)
    try:
        code_nodes = html_lxml(reply,'//body/pre/code/text()')
        first_node = code_nodes[0]
        return first_node.replace("&lt;","<").replace("&gt;",">")
    except Exception as err:
        print(err,base_url)
        return None

# https://www.codebeautify.com/URLService
def view_source3(base_url, url, Method, decodel, encodel, cnt=2):
    """Fetch ``url`` through the URL service at ``base_url``.

    Submits ``{"path": url}`` with ``Method`` and returns the decoded reply
    as-is, with no HTML extraction.

    Args:
        base_url: address of the URL service.
        url: page whose content is wanted.
        Method: HTTP method used for the request.
        decodel: fallback codec for decoding the response.
        encodel: codec for encoding the form body.
        cnt: attempt limit forwarded to ``get_htmlData``.

    Returns:
        The response text, or None when the request fails.
    """
    user_agent = random_headers(True)["User-Agent"]
    # Only origin, referer and user-agent are sent. The original listing also
    # had a browser's other request headers (accept*, content-length,
    # content-type, sec-ch-ua*, sec-fetch-* and the HTTP/2 pseudo-headers)
    # commented out, so they are left out here as well.
    headers = {}
    headers["origin"] = "https://codebeautify.org"
    headers["referer"] = "https://codebeautify.org/"
    headers["user-agent"] = user_agent

    form_fields = {"path":url}
    return get_htmlData(base_url, Method, decodel, headers, encodel, form_fields, cnt=cnt)

def all_viewSource(url,Method,decodel,encodel,cnt=2):
    """Download ``url`` directly or through a randomly chosen mirror.

    When ``Method`` is "POST" each round picks one of three view-source
    mirrors at random (about 40% / 20% / 40%) and asks it for the page.
    For any other ``Method`` the page is requested directly with a mobile
    User-Agent. Rounds repeat until one returns data or ``cnt`` rounds have
    been used.

    Args:
        url: page to download.
        Method: "POST" to go through a mirror; otherwise passed straight to
            ``get_htmlData`` for a direct request.
        decodel: fallback codec for decoding responses.
        encodel: codec for encoding form bodies.
        cnt: maximum number of rounds; also forwarded as the attempt limit
            of the underlying requests.

    Returns:
        The page text, or None when every round failed.
    """
    base_url  = ["https://www.view-page-source.com/","https://neatnik.net/view-source/","https://www.codebeautify.com/URLService"]
    data = None
    rounds_done = 0
    # Run up to ``cnt`` rounds, so a failed mirror gets a retry.
    while rounds_done < cnt and data is None:
        rounds_done += 1
        if Method == "POST":
            num = random.random()
            if num <= 0.4:
                data = view_source(base_url[0],url,Method,decodel,encodel,cnt)
            elif num <= 0.6:
                data = view_source2(base_url[1],url,Method,decodel,encodel,cnt)
            else:
                data = view_source3(base_url[2],url,Method,decodel,encodel,cnt)
        else:
            headers = random_headers(False)
            data = get_htmlData(url, Method, decodel, headers, encodel, cnt=cnt)
    return data

利用

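上面的程序基于三个位于国外的网站:https://www.view-page-source.com/、https://neatnik.net/view-source/、https://www.codebeautify.com/URLService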

这些网站位于国外,因此可以借助它们获取外部信息,查找资料会很方便。先利用谷歌搜索到信息链接,再用该 Python 程序把页面下载下来。

谷歌镜像链接分享:

https://googe.fun/
https://googlehnzyc.azurewebsites.net/
https://www.qinai.ml/
https://xn--flw351e.ml/
https://note.cm/
https://search.ahau.cf/
https://search.aust.cf/
https://search.ahau.cf/
https://gogogle.cf/
https://s.iit.xyz/
https://goo.xbzxs.org/
https://ge1.azurewebsites.net/

推特镜像:https://www.sotwe.com

Contact Info.

留言: 123guestbook