---
title: Detecting Web Page Encoding in Python
copyright: true
top: 0
date: 2018-05-04 17:37:58
tags: Basics
categories: Web Scraping Notes
permalink:
password:
keywords: [Python scraping, encoding]
description: When writing crawlers or running tests, the encoding of a returned page is a real headache. This post collects solutions to web page encoding problems.
---
There is a kind of thirst that only wine can quench; that thirst is loneliness.
## Finding Data Based on the Page's Reported Encoding

Say I want to grab a page's title: a direct regex match on `<title>(.*?)</title>` should do the job. But quite often, because of encoding problems, the requests library cannot decode the response correctly, so the match comes up empty.
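For instance (a quick sketch; the URL is a placeholder), when a server sends no charset in its Content-Type header, requests falls back to the HTTP/1.1 default of ISO-8859-1, and `r.text` for a GBK page comes out garbled:

```python
import requests

# Placeholder URL standing in for any GBK-encoded page served without a charset header
r = requests.get('http://example.com/gbk-page')
print(r.encoding)    # 'ISO-8859-1' - requests' fallback when the header names no charset
print(r.text[:60])   # mojibake, so a <title>(.*?)</title> search yields garbage
```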
A solution:
```python
r_port_top = requests.get(url=str('http://' + url), headers=headers, timeout=5)
if r_port_top.encoding == 'ISO-8859-1':
    encodings = requests.utils.get_encodings_from_content(r_port_top.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = r_port_top.apparent_encoding
    encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
```
The idea is to detect the page's encoding first and then convert the content. But this still misses some cases, so next comes the ultimate version.
```python
try:
    UA = random.choice(headerss)
    headers = {'User-Agent': UA}
    r_port_top = requests.get(url=str('http://' + url), headers=headers, timeout=5)
    # ISO-8859-1 and every spelling of the GB-family encodings get the same treatment
    if r_port_top.encoding in ('ISO-8859-1', 'GB2312', 'gb2312', 'GBK', 'gbk'):
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group().replace('<title>', '').replace('</title>', '')
    else:
        port_title = re.search('<title>(.*?)</title>', r_port_top.content, re.S).group().replace('<title>', '').replace('</title>', '')
except:
    try:
        port_title = re.search('<title>(.*?)</title>', r_port_top.content, re.S).group().replace('<title>', '').replace('</title>', '')
    except:
        port_title = 'Unable to get the site title'
```
## Using chardet to Detect and Convert Directly

The approach above is honestly clumsy; chardet solves the page encoding problem with ease.
```python
# -*- coding: utf-8 -*-
# @Time : 2018/5/4 0004 8:55
# @Author : Langzi
# @Blog : www.langzi.fun
# @File : get urls.py
# @Software: PyCharm
import sys
import chardet
import re
import requests

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only

url = 'https://stackoverflow.com'
d1 = requests.get(url)
print d1.content
if isinstance(d1.content, unicode):
    pass
else:
    codesty = chardet.detect(d1.content)
    a = d1.content.decode(codesty['encoding'])
```
The resulting `a` holds the page content decoded to its final form; at that point a plain `re.search('<title>(.*?)</title>', a)` is all it takes to match any page's title.
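In case it helps, here is a minimal sketch of what `chardet.detect` actually returns - a dict with the guessed encoding and a confidence score (exact values vary by page and chardet version):

```python
import chardet
import requests

r = requests.get('http://www.langzi.fun')
result = chardet.detect(r.content)
print(result)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}

# detect() can return {'encoding': None} on unrecognizable bytes, so keep a fallback
text = r.content.decode(result['encoding'] or 'utf-8', 'replace')
```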
## An Even Simpler Way

requests ships with an api that can quickly identify the encoding declared inside the page, after which you convert to utf-8:
```python
import requests

url = 'http://www.langzi.fun'
r = requests.get(url)
encoding = requests.utils.get_encodings_from_content(r.text)[0]
print(encoding)
res = r.content.decode(encoding, 'replace')
# Replace malformed bytes; this makes it easy to see at a glance which characters broke.
res = r.content.decode(encoding, 'ignore')
# Ignore malformed bytes and keep only what decodes cleanly.
```
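The difference between the two error handlers is easiest to see on a deliberately broken byte string (a toy example, not from the original post):

```python
broken = b'\xe8\x98\xad\xff'  # valid UTF-8 for one CJK character plus a stray 0xff byte

print(broken.decode('utf-8', 'replace'))  # the bad byte shows up as U+FFFD, easy to spot
print(broken.decode('utf-8', 'ignore'))   # the bad byte is silently dropped
```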
Looking at that api's source shows how it works: regular expressions pull the declared encoding out of the page content.
```python
def get_encodings_from_content(content):
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    return (charset_re.findall(content) +
            pragma_re.findall(content) +
            xml_re.findall(content))
```
Once you know how it works, you can port this function into your own helpers - always nice to pick up a trick~
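As a rough sketch of that porting idea (the helper name `decode_html` is mine), you can trust the in-page declaration first and fall back to chardet only when no declaration is found:

```python
import re
import chardet

def decode_html(content):
    # Trust a <meta charset=...> declaration in the raw bytes first,
    # then fall back to chardet's statistical guess.
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    declared = charset_re.findall(content.decode('ascii', 'ignore'))
    encoding = declared[0] if declared else chardet.detect(content)['encoding']
    return content.decode(encoding or 'utf-8', 'replace')
```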
## Getting Page Information

Encoding detection here is done with the chardet library.

To get a page's title, its text content, and its outbound links, I wrote a class. Usage is as follows:
```python
d = Get_Info(url='http://www.langzi.fun')
d1 = d.get_urls()
# Returns all outbound links found at the given URL as a list; returns None when there is no data (same below)
d2 = d.get_infos()
# Returns the page title and content as a dict
d3 = d.get_ips()
# Returns the site's ip and open ports as a dict
```
The full code:
```python
# coding:utf-8
import re
import requests
import time
import socket
from bs4 import BeautifulSoup as bs
import chardet
import sys

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only

timeout = 3
socket.setdefaulttimeout(timeout)

from requests.packages import urllib3
urllib3.disable_warnings()

# Common service ports to probe
ports = [21, 22, 23, 25, 53, 69, 139, 445, 389, 1433, 1521, 2181, 3306,
         3389, 5432, 5984, 6379, 7001, 7002, 8069, 11211, 27017, 27018,
         50070, 50030]


class Get_Info:
    def __init__(self, url):
        self.url = url

    def get_ips(self):
        # Resolve the hostname, then try a TCP connect on each port in the list
        url_port = []
        url_port.append(80)
        hostname = self.url.replace('http://', '').replace('https://', '').replace('/', '')
        url_ip = 'None'
        try:
            url_ip = socket.gethostbyname(str(hostname))
        except:
            pass
        if url_ip and url_ip != 'None':
            for port in ports:
                s = socket.socket()
                try:
                    s.connect((url_ip, port))
                    url_port.append(port)
                except Exception, e:
                    # print e
                    pass
                finally:
                    s.close()
        if url_ip and url_ip != 'None':
            infos = {}
            infos['ip'] = str(url_ip)
            infos['ports'] = str(url_port)
            return infos
        else:
            return None

    def get_infos(self):
        # Decode the page via chardet, then parse with BeautifulSoup;
        # fall back to a bare regex if parsing fails
        try:
            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            r = requests.get(url=self.url, headers=headers, verify=False, timeout=5)
            url_title, url_content, url_service = 'fetch failed', 'fetch failed', 'fetch failed'
            try:
                code = chardet.detect(r.content)['encoding']
                bp = bs(r.content.decode(code).encode('utf-8'), 'html.parser')
                url_title = bp.title.string
                url_content = bp.text
                url_service = r.headers
            except:
                url_title = re.search('<title>(.*?)</title>', r.content, re.I).group(1).decode(code).encode('utf-8')
                url_content = re.sub('([\.\?\*~!@#{$%\^&\*()-;"<>\[\]}_\+=]|[0-9]|[a-z]|[A-Z])', '', r.text)
                url_service = r.headers
            infos = {}
            infos['url'] = r.url
            infos['title'] = url_title
            url_contents = ''.join(r.text.split()).replace(' ', '')
            infos['content'] = re.sub('([\.\?\*~!@#{$%\^&\*()-;"<>\[\]}_\+=]|[0-9]|[a-z]|[A-Z])', '', url_contents).replace('|', '').replace("'", '')
            infos['service'] = url_service
            if infos:
                return infos
            else:
                return None
        except Exception, e:
            print e

    def get_urls(self):
        # Pull every absolute http(s) URL out of the page and reduce it to scheme://host
        urlss = []
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        try:
            r = requests.get(url=self.url, headers=headers, verify=False, timeout=5)
            pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)
            urls = re.findall(pattern, r.content)
            for x in urls:
                a1, a2 = x.split('//')[0], x.split('//')[1].split('/')[0]
                a3 = ''.join(a1) + '//' + ''.join(a2)
                urlss.append(a3.replace("'", "").replace('>', '').replace('<', ''))
            if urlss:
                return list(set(urlss))
            else:
                return None
        except Exception, e:
            print e
            pass
```
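For completeness, here is a minimal Python 3 sketch of the same chardet-based idea (the function name is mine, not part of the class above): fetch a page, let chardet guess the encoding, and pull out the title.

```python
# -*- coding: utf-8 -*-
import re

import chardet
import requests

def fetch_title(url, timeout=5):
    # Fetch the page, decode via chardet's guess, and extract <title>.
    # Returns None when the request or the match fails.
    try:
        r = requests.get(url, timeout=timeout)
        encoding = chardet.detect(r.content)['encoding'] or 'utf-8'
        html = r.content.decode(encoding, 'replace')
        m = re.search(r'<title>(.*?)</title>', html, re.S | re.I)
        return m.group(1).strip() if m else None
    except requests.RequestException:
        return None

print(fetch_title('http://www.langzi.fun'))
```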
Personal blog: www.langzi.fun
Happy to talk Python development and security testing.
Follow the WeChat official account 【安全研發】 for more tools, courses, and shared resources~