前言
view source 即为查看网页源代码。
程序
import urllib.request as UR
import random, time, re
from lxml import etree
import urllib.parse as urlParse
import sys
#import json
import os
def delay(n=1):
    """Sleep for a random duration between 0 and *n* seconds.

    Used to space out consecutive requests. A zero or negative *n* results
    in no sleep at all instead of letting ``time.sleep`` raise ``ValueError``
    on a negative duration.

    Args:
        n: Upper bound of the pause, in seconds.

    Returns:
        None.
    """
    pause = random.random() * n
    if pause > 0:
        time.sleep(pause)
    return None
def random_headers(isPc, cookie=None):
    """Build a request-header dict with a randomly chosen User-Agent.

    Using this helper is optional: callers may also write their headers by
    hand.

    Args:
        isPc: Truthy to pick a desktop browser User-Agent, falsy to pick a
            mobile one.
        cookie: Optional cookie string. When it is not None it is added to
            the result under the ``"cookie"`` key.

    Returns:
        A dict with a ``"User-Agent"`` entry and, if requested, a
        ``"cookie"`` entry.
    """
    if isPc:
        ua = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36 OPR/77.0.4054.277",
        ]
    else:
        # The first and fourth entries are identical, so that iPhone UA is
        # twice as likely to be drawn as the others; kept as in the original.
        ua = [
            "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Linux; Android 10; HarmonyOS; FRL-AN00a; HMSCore 6.8.0.332) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.105 HuaweiBrowser/12.1.4.304 Mobile Safari/537.36",
        ]

    headers = {"User-Agent": random.choice(ua)}
    if cookie is not None:
        headers["cookie"] = cookie
    return headers
def get_htmlData(url, Method, decodel=None, headers=None, encodel=None, data_dic=None, cnt=None):
    """Fetch *url* with GET or POST and return the decoded response body.

    The request is attempted up to *cnt* times. Every failed attempt prints
    the error, raises the timeout by 2 seconds (starting from 5) and waits a
    random sub-second delay before the next try.

    Args:
        url: Address to request.
        Method: ``"GET"`` or ``"POST"``; anything else is rejected.
        decodel: Fallback codec for decoding the body when the response does
            not declare a usable charset. Defaults to ``"utf-8"``.
        headers: Optional dict of request headers. ``None`` means no extra
            headers.
        encodel: Codec used to encode the POST form data. Defaults to
            ``"utf-8"``.
        data_dic: Form fields for a POST request; required for POST.
        cnt: Maximum number of attempts. Defaults to 3.

    Returns:
        The response body as ``str`` (undecodable bytes are dropped), or
        ``None`` when the arguments are unusable or every attempt failed.
    """
    if cnt is None:
        cnt = 3
    if decodel is None:
        decodel = "utf-8"
    if headers is None:
        # urllib.request.Request iterates over headers.items(), so None
        # would make every attempt fail with AttributeError.
        headers = {}

    formdata = None
    if Method == "POST":
        if data_dic is None:
            print("data_dic is empty !")
            return None
        if encodel is None:
            encodel = "utf-8"
        try:
            formdata = urlParse.urlencode(data_dic).encode(encodel)
        except (TypeError, ValueError, LookupError) as error:
            # Bad form data or an unknown codec cannot be fixed by retrying.
            print(error)
            return None
    elif Method != "GET":
        print("目前只支持GET和POST这两种方式")
        return None

    out_time = 5
    attempt = 1
    # "attempt <= cnt" performs exactly cnt attempts; the previous
    # "n >= cnt" check stopped one attempt early (cnt=1 sent no request).
    while attempt <= cnt:
        if attempt > 1:
            print("第%s次请求开始%s" % (attempt, url))
        try:
            request = UR.Request(url=url, data=formdata, headers=headers, method=Method)
            # The context manager closes the connection even if read() fails.
            with UR.urlopen(request, timeout=out_time) as response:
                charset = response.info().get_param("charset")
                raw = response.read()
            try:
                # Prefer the charset announced by the server.
                return raw.decode(charset or decodel, "ignore")
            except LookupError:
                # The server named a codec Python does not know.
                return raw.decode(decodel, "ignore")
        except Exception as error:
            # Best-effort fetch: report the failure and retry.
            print(error)
        out_time += 2
        attempt += 1
        delay(1)
    return None
def html_lxml(data, xpath):
    """Parse *data* as HTML and evaluate *xpath* against it.

    Args:
        data: HTML text, or None/empty when nothing was fetched.
        xpath: XPath expression to evaluate on the parsed document.

    Returns:
        Whatever ``xpath()`` yields for the expression (a list for node-set
        expressions), or ``None`` when there is nothing to parse.
    """
    if not data:
        return None
    node = etree.HTML(data)
    if node is None:
        # NOTE(review): lxml appears to return None for input without any
        # parsable markup (e.g. whitespace only) -- confirm; guarded so that
        # .xpath is never called on None.
        return None
    return node.xpath(xpath)
def html_re(data, pattern, isList, num=0):
    """Extract text from *data* with a regular expression.

    The pattern is compiled with ``re.S`` so that ``.`` also matches
    newlines.

    Args:
        data: Text to search, or None.
        pattern: Regular expression source.
        isList: Truthy to return every match (``findall``); falsy to return
            a single group of the first match (``search``).
        num: Group number to return when *isList* is falsy.

    Returns:
        ``None`` when *data* is None; the ``findall`` list when *isList* is
        truthy; otherwise group *num* of the first match, or ``None`` when
        nothing matches.
    """
    if data is None:
        return None
    info = re.compile(pattern, re.S)
    if isList:
        return info.findall(data)
    item = info.search(data)
    if item is None:
        return None
    return item.group(num)
def File_IO(file, Mode, content=None, encodel=None):
    """Read a text file, or write/append text to one.

    Args:
        file: Path of the file.
        Mode: ``open()`` mode; upper-case letters are accepted. ``"r"``
            reads, every other mode is treated as a write.
        content: Text to write; ignored when reading.
        encodel: Text encoding. Defaults to ``"utf-8"``.

    Returns:
        The file content when reading, or ``None`` when the file to read
        does not exist. Writes always return ``None``; when *content* is
        None a notice is printed and the file is left untouched.
    """
    if encodel is None:
        encodel = "utf-8"
    # open() only understands lower-case modes; "R" used to pass the mode
    # check and then fail inside open().
    mode = Mode.lower()

    if mode == "r":
        # The existence check applies to reads only. It used to run for
        # writes as well, which made it impossible to create a new file.
        if not os.path.isfile(file):
            return None
        with open(file, mode, encoding=encodel) as rfile:
            return rfile.read()

    if content is None:
        print("数据为空!")
        return None
    with open(file, mode, encoding=encodel) as ofile:
        ofile.write(content)
    return None
def view_source(base_url, url, Method, decodel, encodel, cnt=2):
    """Fetch the source of *url* through a token-protected view-source page.

    The page at *base_url* is first loaded with GET to read the value of its
    ``vps_token`` input; the token is then submitted together with *url*,
    and the text of the ``rawcode`` textarea in the answer is returned.

    Args:
        base_url: Address of the view-source service.
        url: Page whose source is wanted.
        Method: HTTP method for the second request.
        decodel: Fallback codec for decoding responses.
        encodel: Codec for encoding the form data.
        cnt: Attempt limit forwarded to ``get_htmlData``.

    Returns:
        The extracted source text, or ``None`` when any step fails.
    """
    headers = random_headers(True)
    landing_page = get_htmlData(base_url, "GET", decodel=decodel, headers=headers, encodel=encodel, cnt=cnt)
    if landing_page is None:
        return None

    tokens = html_lxml(landing_page, '//input[@id="vps_token"]/@value')
    if not tokens:
        # Covers both None and an empty result; an empty list used to slip
        # through and raise IndexError on tokens[0].
        return None

    data_dic = {"vps_token": tokens[0], "uri": url}
    pre_data = get_htmlData(base_url, Method, decodel, headers, encodel, data_dic, cnt=cnt)
    lst_data = html_lxml(pre_data, '//textarea[@id="rawcode"]/text()')
    try:
        # NOTE(review): the replace() arguments were identical no-ops
        # ("<" -> "<"), which looks like HTML entities lost when the code
        # was rendered; restored as &lt;/&gt; -- confirm against the
        # service's actual output.
        return lst_data[0].replace("&lt;", "<").replace("&gt;", ">")
    except (TypeError, IndexError, AttributeError) as err:
        # lst_data is None (request failed) or empty (textarea not found).
        print(err, base_url)
        return None
def view_source2(base_url, url, Method, decodel, encodel, cnt=2):
    """Fetch the source of *url* through a service that takes a ``url`` field.

    *url* is sent to *base_url* as the form field ``url`` and the text of
    the first ``body/pre/code`` node in the answer is returned.

    Args:
        base_url: Address of the view-source service.
        url: Page whose source is wanted.
        Method: HTTP method for the request.
        decodel: Fallback codec for decoding the response.
        encodel: Codec for encoding the form data.
        cnt: Attempt limit forwarded to ``get_htmlData``.

    Returns:
        The extracted source text, or ``None`` when the request or the
        extraction fails.
    """
    headers = random_headers(True)
    pre_data = get_htmlData(base_url, Method, decodel, headers, encodel, {"url": url}, cnt=cnt)
    try:
        data_lst = html_lxml(pre_data, '//body/pre/code/text()')
        # NOTE(review): the replace() arguments were identical no-ops
        # ("<" -> "<"), which looks like HTML entities lost when the code
        # was rendered; restored as &lt;/&gt; -- confirm against the
        # service's actual output.
        return data_lst[0].replace("&lt;", "<").replace("&gt;", ">")
    except Exception as err:
        # Best-effort: parser errors, a missing node and a failed request
        # are all reported the same way.
        print(err, base_url)
        return None
# https://www.codebeautify.com/URLService
def view_source3(base_url, url, Method, decodel, encodel, cnt=2):
    """Fetch the source of *url* through a service that takes a ``path`` field.

    *url* is sent to *base_url* as the form field ``path`` and the response
    body is returned as-is, without any HTML extraction.

    Args:
        base_url: Address of the URL service.
        url: Page whose source is wanted.
        Method: HTTP method for the request.
        decodel: Fallback codec for decoding the response.
        encodel: Codec for encoding the form data.
        cnt: Attempt limit forwarded to ``get_htmlData``.

    Returns:
        The response body, or ``None`` when the request fails.
    """
    # Only origin, referer and user-agent are sent. The remaining browser
    # headers (accept*, content-length, content-type, sec-ch-ua*,
    # sec-fetch-*) were present but disabled in the original code.
    headers = {
        "origin": "https://codebeautify.org",
        "referer": "https://codebeautify.org/",
        "user-agent": random_headers(True)["User-Agent"],
    }
    return get_htmlData(base_url, Method, decodel, headers, encodel, {"path": url}, cnt=cnt)
def all_viewSource(url, Method, decodel, encodel, cnt=2):
    """Fetch the source of *url*, directly or through a random mirror.

    With ``Method == "POST"`` each round picks one of three online
    view-source services at random (probabilities 0.4 / 0.2 / 0.4) and asks
    it for the page. With any other method the page is requested directly
    using a mobile User-Agent. Rounds repeat until one returns data or
    *cnt* rounds have been made.

    Args:
        url: Page whose source is wanted.
        Method: ``"POST"`` to go through a mirror; otherwise passed to
            ``get_htmlData`` for a direct request.
        decodel: Fallback codec for decoding responses.
        encodel: Codec for encoding form data.
        cnt: Maximum number of rounds; also forwarded to the helpers as
            their own attempt limit.

    Returns:
        The page source, or ``None`` when every round failed.
    """
    base_url = [
        "https://www.view-page-source.com/",
        "https://neatnik.net/view-source/",
        "https://www.codebeautify.com/URLService",
    ]
    data = None
    attempt = 0
    # Exactly cnt rounds. The counter used to start at 1 and be incremented
    # before the check, so cnt=2 never retried and cnt=1 did nothing.
    while data is None and attempt < cnt:
        attempt += 1
        if Method == "POST":
            num = random.random()
            if num <= 0.4:
                data = view_source(base_url[0], url, Method, decodel, encodel, cnt)
            elif num <= 0.6:
                data = view_source2(base_url[1], url, Method, decodel, encodel, cnt)
            else:
                data = view_source3(base_url[2], url, Method, decodel, encodel, cnt)
        else:
            headers = random_headers(False)
            data = get_htmlData(url, Method, decodel, headers, encodel, cnt=cnt)
    return data
利用
上面用到的在线查看源代码服务都位于国外,因此可以借助它们获取外部信息,查找资料会方便很多。先用谷歌搜索到目标页面的链接,再用上面的 Python 程序把页面内容下载下来。
谷歌镜像链接分享:
https://googe.fun/
https://googlehnzyc.azurewebsites.net/
https://www.qinai.ml/
https://xn--flw351e.ml/
https://note.cm/
https://search.ahau.cf/
https://search.aust.cf/
https://search.ahau.cf/
https://gogogle.cf/
https://s.iit.xyz/
https://goo.xbzxs.org/
https://ge1.azurewebsites.net/
推特镜像:https://www.sotwe.com