目錄:
urlencode & quote & unquote (url 中帶中文參數)
python httplib urllib urllib2差別(一瞥)
python post請求執行個體 & json -- str互相轉化(application/x-www-form-urlencoded \ multipart/form-data)
1, 前言:
python提供了很多種非常友好的訪問網頁内容的方法:python2.x 有 httplib、urllib 和 urllib2;python3.x 又提供了 urllib.request(另有常用的第三方庫 requests)。同時,每種方法下面又分為 get、post、put、delete 等 method。
一時間江湖上充斥着“五門八派”的各種方法,令初學者眼花缭亂,不知從何下手、從何學起。
但是,有一點需要提醒的是:無論哪一種方案或方法,存在即有其合理性;哪一種方法用着順手就用哪一種,得心應手才是王道!
2, 下面我們比較一下python2.x 中的三種方法,先上實例,之後分析
(1)實例
import json
import sys
import hashlib
import urllib
import httplib
### none using now
def generate_json_list():
reload(sys)
sys.setdefaultencoding('gbk')
print "[",
flag=False
for line in sys.stdin:
if flag:
print ",",
else:
flag=True
line=line.strip()
items=line.split("\t")
out={"key":"","createdAt":"","word":"","channel":"","type":"","scale":""}
out["createdAt"]=items[0]
out["scale"]=items[1]
out["channel"]=items[2]
out["word"]=items[3]
print json.dumps(out,encoding="gbk").decode("unicode-escape"),
print "]"
import urllib2
def import_out_hotwords(key, json_str, out):
HOST = "http://10.129.232.109:5005/api/externalHotWords/insertSingle"
#HOST = "http://10.129.232.109:5005/api/externalHotWords/insertSin"
#print "2--", json_str
value={"configKey":key,"configValue":json_str}
data=urllib.urlencode(value)
print >> sys.stderr, "### 3params", value, data
req = urllib2.Request(HOST, data)
req.add_header("content-type", "application/x-www-form-urlencoded")
req.get_method = lambda : 'PUT'
response = None
try:
response = urllib2.urlopen(req, timeout=5)
if response.code == 200:
print "insertSingle Succ: ", out["word"], out["channel"], out["key"]
response.close()
except urllib2.URLError as e:
if hasattr(e, 'code'):
print 'Error code:',e.code
elif hasattr(e, 'reason'):
print 'Reason:',e.reason
finally:
if response:
response.close()
def import_out_hotwords_2(key, json_str, out):
HOST = "http://10.129.232.109:5005/api/externalHotWords/insertSingle"
#HOST = "http://10.129.232.109:5005/api/externalHotWords/insertSin"
#print "2--", json_str
value={"configKey":key,"configValue":json_str}
data=urllib.urlencode(value)
print >> sys.stderr, "## 2params", value, data
req = urllib2.Request(HOST, data)
req.add_header("content-type", "application/x-www-form-urlencoded")
req.get_method = lambda : 'PUT'
response = None
try:
response = urllib2.urlopen(req, timeout=5)
if response.code == 200:
print "insertSingle Succ: ", out["word"], out["channel"], out["key"]
response.close()
except urllib2.URLError as e:
if hasattr(e, 'code'):
print 'Error code:',e.code
elif hasattr(e, 'reason'):
print 'Reason:',e.reason
finally:
if response:
response.close()
def import_out_hotwords_old(key, json_str, out):
HOST = "10.129.232.109:5005"
conn = httplib.HTTPConnection(HOST)
#print "2--", json_str
value={"configKey":key,"configValue":json_str}
data=urllib.urlencode(value)
#print data
headers = {
'content-type': 'application/x-www-form-urlencoded',
'cache-control': 'no-cache'
}
conn.request("PUT", "/api/externalHotWords/insertSingle", body=data, headers=headers)
handler = conn.getresponse()
if handler.status == 200:
print "insertSingle Succ: ", out["word"], out["channel"], out["key"]
#if handler.read().decode('utf8').encode('gbk')[0] == "OK":
# print "insertSingle Succ: ", json_str
conn.close()
def generate_json():
    """Read tab-separated records from stdin and upload each one.

    Fields 0, 2 and 3 of every line become createdAt, channel and word.
    The record key is "externalHotWords_" plus the MD5 hex digest of
    word + channel.  Each record is serialised with json.dumps and handed
    to import_out_hotwords_2 unquoted, so it is URL-encoded only once.
    Lines with fewer than four fields are ignored.
    """
    # Restore sys.setdefaultencoding (removed at start-up) and switch to gbk.
    reload(sys)
    sys.setdefaultencoding('gbk')
    for raw in sys.stdin:
        fields = raw.strip().split("\t")
        if len(fields) >= 4:
            channel = fields[2]
            word = fields[3]
            record_key = "externalHotWords_" + hashlib.md5(word + channel).hexdigest()
            record = {"key":"","createdAt":"","word":"","channel":"","type":"","scale":""}
            record["createdAt"] = fields[0]
            record["channel"] = channel
            record["word"] = word
            record["key"] = record_key
            payload = json.dumps(record, encoding="gbk")
            import_out_hotwords_2(record_key, payload, record)
def generate_json_old():
    """Read tab-separated records from stdin and send each via import_out_hotwords.

    Fields 0, 2 and 3 of a line are used as createdAt, channel and word;
    lines with fewer than four fields are skipped.
    """
    # reload(sys) restores setdefaultencoding, which is removed at start-up.
    reload(sys)
    sys.setdefaultencoding('gbk')
    for line in sys.stdin:
        line=line.strip()
        items=line.split("\t")
        # Skip lines that do not carry at least four tab-separated fields.
        if len(items) < 4:
            continue
        out={"key":"","createdAt":"","word":"","channel":"","type":"","scale":""}
        out["createdAt"]=items[0]
        #out["scale"]=items[1]
        out["channel"]=items[2]
        out["word"]=items[3]
        # The MD5 of word + channel is passed (without prefix) as the configKey;
        # the prefixed form is stored inside the record itself.
        key = hashlib.md5((items[3] + items[2])).hexdigest()
        out["key"] = "externalHotWords_" + key
        # unicode-escape decoding turns the \uXXXX escapes from json.dumps
        # back into real characters (the result is a unicode object).
        json_str = json.dumps(out,encoding="gbk").decode("unicode-escape")
        #json_str = out
        #print "1--", json_str
        ## return 'req=' + urllib.quote(reqinfo.decode('gbk', 'ignore').encode('utf8'))
        # NOTE(review): the value is percent-encoded here with urllib.quote and
        # import_out_hotwords then applies urllib.urlencode to it again, so the
        # server receives a double-encoded configValue (%7B becomes %257B).
        # NOTE(review): json_str is already unicode, so .decode('gbk', ...) relies
        # on an implicit encode with the default codec set above - confirm intent.
        import_out_hotwords(key, urllib.quote(json_str.decode('gbk', 'ignore').encode('utf8')), out)
        #import_out_hotwords(key, json_str)
# Script entry point: process the records piped in on stdin.
if __name__=="__main__":
    #generate_json_list()
    generate_json()
下面的實例存在一個小問題:二次編碼問題。首先對out進行json.dumps() 得到json_str(正确),之後對json_str進行urllib.quote() (第一次編碼);
最後在 value={"configKey":key,"configValue":json_str} 之後又調用了urllib.urlencode() (第二次編碼)
格式一:configValue=%7B%27scale%27%3A+%27%27%2C+%27word%27%3A+%27%5Cxb2%5Cxe2%5Cxca%
5Cxd4soso%27%2C+%27channel%27%3A+%27360_%5Cxca%5Cxb5%5Cxca%5Cxb1%5Cxc8%5Cxc8%5Cxb5%5Cxe3%27%2C+%27key%27%3A+%27externalHotWords_ed9f4ea3b7ff116c67366f7a576bcb08%27%2C+%27type%
27%3A+%27%27%2C+%27createdAt%27%3A+%272017-06-07+11%3A22%3A32%27%7D&configKey=ed9f4ea3b7ff116c67366f7a576bcb08
格式二:configValue=%257B%2522scale%2522%253A%2520%2522%2522%252C%2520%2522word%2522%253A%2520%2522%25E6%25B5%258B%25E8%25AF%2595soso%2522%2
52C%2520%2522channel%2522%253A%2520%2522360_%25E5%25AE%259E%25E6%2597%25B6%25E7%2583%25AD%25E7%2582%25B9%2522%252C%2520%2522key%2522%253A%2520%2522externalHotWords_ed9f4ea3b7f
f116c67366f7a576bcb08%2522%252C%2520%2522type%2522%253A%2520%2522%2522%252C%2520%2522createdAt%2522%253A%2520%25222017-06-07%252011%253A22%253A32%2522%257D&configKey=ed9f4ea3b
7ff116c67366f7a576bcb08
顯然格式二是對格式一再次進行了編碼(因為 { --> %7B,而 % --> %25,所以 %7B 再編碼一次就變成了 %257B)
import json
import sys
import hashlib
import urllib
import httplib
### none using now
def generate_json_list():
reload(sys)
sys.setdefaultencoding('gbk')
print "[",
flag=False
for line in sys.stdin:
if flag:
print ",",
else:
flag=True
line=line.strip()
items=line.split("\t")
out={"key":"","createdAt":"","word":"","channel":"","type":"","scale":""}
out["createdAt"]=items[0]
out["scale"]=items[1]
out["channel"]=items[2]
out["word"]=items[3]
print json.dumps(out,encoding="gbk").decode("unicode-escape"),
print "]"
import urllib2
def import_out_hotwords(key, json_str, out):
HOST = "http://10.129.232.109:5005/api/externalHotWords/insertSingle"
#HOST = "http://10.129.232.109:5005/api/externalHotWords/insertSin"
#print "2--", json_str
value={"configKey":key,"configValue":json_str}
data=urllib.urlencode(value)
req = urllib2.Request(HOST, data)
req.add_header("content-type", "application/x-www-form-urlencoded")
req.get_method = lambda : 'PUT'
response = None
try:
response = urllib2.urlopen(req, timeout=5)
if response.code == 200:
print "insertSingle Succ: ", out["word"], out["channel"], out["key"]
response.close()
except urllib2.URLError as e:
if hasattr(e, 'code'):
print 'Error code:',e.code
elif hasattr(e, 'reason'):
print 'Reason:',e.reason
finally:
if response:
response.close()
def import_out_hotwords_old(key, json_str, out):
HOST = "10.129.232.109:5005"
conn = httplib.HTTPConnection(HOST)
#print "2--", json_str
value={"configKey":key,"configValue":json_str}
data=urllib.urlencode(value)
#print data
headers = {
'content-type': 'application/x-www-form-urlencoded',
'cache-control': 'no-cache'
}
conn.request("PUT", "/api/externalHotWords/insertSingle", body=data, headers=headers)
handler = conn.getresponse()
if handler.status == 200:
print "insertSingle Succ: ", out["word"], out["channel"], out["key"]
#if handler.read().decode('utf8').encode('gbk')[0] == "OK":
# print "insertSingle Succ: ", json_str
conn.close()
def generate_json():
    """Read tab-separated records from stdin and send each via import_out_hotwords.

    Fields 0, 2 and 3 of a line are used as createdAt, channel and word;
    lines with fewer than four fields are skipped.  This variant quotes
    the JSON before handing it over, so the value is encoded twice.
    """
    # reload(sys) restores setdefaultencoding, which is removed at start-up.
    reload(sys)
    sys.setdefaultencoding('gbk')
    for line in sys.stdin:
        line=line.strip()
        items=line.split("\t")
        # Ignore lines without at least four tab-separated fields.
        if len(items) < 4:
            continue
        out={"key":"","createdAt":"","word":"","channel":"","type":"","scale":""}
        out["createdAt"]=items[0]
        #out["scale"]=items[1]
        out["channel"]=items[2]
        out["word"]=items[3]
        # configKey is the bare MD5 of word + channel; the record stores the
        # prefixed form.
        key = hashlib.md5((items[3] + items[2])).hexdigest()
        out["key"] = "externalHotWords_" + key
        # Decode the \uXXXX escapes from json.dumps into real characters.
        json_str = json.dumps(out,encoding="gbk").decode("unicode-escape")
        #json_str = out
        #print "1--", json_str
        ## return 'req=' + urllib.quote(reqinfo.decode('gbk', 'ignore').encode('utf8'))
        # First encoding: urllib.quote here.  Second encoding: urllib.urlencode
        # inside import_out_hotwords.  The result is double-encoded (% -> %25).
        import_out_hotwords(key, urllib.quote(json_str.decode('gbk', 'ignore').encode('utf8')), out)
        #import_out_hotwords(key, json_str)
# Script entry point: process the records piped in on stdin.
if __name__=="__main__":
    #generate_json_list()
    generate_json()
CMD: cat tmp | python generate_json2.py
[@10.134.105.160 HotRankingLoggers]# vi tmp
2017-06-07 11:22:32 6964 360_實時熱點 測試APP
2017-06-07 11:22:32 6498 360_實時熱點 測試soso
(2)分析(參考 python的httplib、urllib和urllib2的差別及用法 )
urllib和urllib2
urllib 和urllib2都是接受URL請求的相關模組,但是urllib2可以接受一個Request類的實例來設定URL請求的headers,urllib僅可以接受URL。
這意味着,使用urllib時你不可以僞裝你的User Agent字元串等。
urllib提供urlencode方法用來GET查詢字元串的産生,而urllib2沒有。這是為何urllib常和urllib2一起使用的原因。
目前的大部分http請求都是通過urllib2來發送的
httplib
httplib實作了HTTP和HTTPS的用戶端協定,一般不直接使用,在python更高層的封裝模組中(urllib,urllib2)使用了它的http實作。
(3)詳解
urllib簡單用法
1. google = urllib.urlopen('http://www.google.com')
2. print 'http header:\n', google.info()
3. print 'http status:', google.getcode()
4. print 'url:', google.geturl()
5. for line in google: # 就像在操作本地檔案
6. print line,
7. google.close()
urllib2簡單用法
1. import urllib2
2. response=urllib2.urlopen('http://www.douban.com')
3. html=response.read()
實際步驟:
1、urllib2.Request()的功能是構造一個請求資訊,傳回的req就是一個構造好的請求
2、urllib2.urlopen()的功能是發送剛剛構造好的請求req,并傳回一個檔案類的對象response,包括了所有的傳回資訊。
3、通過response.read()可以讀取到response裡面的html,通過response.info()可以讀到一些額外的資訊。如下:
1. #!/usr/bin/env python
2. import urllib2
3. req = urllib2.Request("http://www.douban.com")
4. response = urllib2.urlopen(req)
5. html = response.read()
6. print html
有時你會碰到,程式也對,但是伺服器拒絕你的訪問。這是為什麼呢?問題出在請求中的頭資訊(header)。 有的服務端有潔癖,不喜歡程式來觸摸它。這個時候你需要将你的程式僞裝成浏覽器來送出請求。請求的方式就包含在header中。常見的情形:
1. import urllib
2. import urllib2
3. url = 'http://www.someserver.com/cgi-bin/register.cgi'
4. user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'# 将user_agent寫入頭資訊
5. values = {'name' : 'who','password':'123456'}
6. headers = { 'User-Agent' : user_agent }
7. data = urllib.urlencode(values)
8. req = urllib2.Request(url, data, headers)
9. response = urllib2.urlopen(req)
10. the_page = response.read()
values是post資料
GET方法
例如百度:
百度是通過http://www.baidu.com/s?wd=XXX 來進行查詢的,這樣我們需要将{'wd':'xxx'}這個字典進行urlencode
1. #coding:utf-8
2. import urllib
3. import urllib2
4. url = 'http://www.baidu.com/s'
5. values = {'wd':'D_in'}
6. data = urllib.urlencode(values)
7. print data
8. url2 = url+'?'+data
9. response = urllib2.urlopen(url2)
10. the_page = response.read()
11. print the_page
POST方法
1. import urllib
2. import urllib2
3. url = 'http://www.someserver.com/cgi-bin/register.cgi'
4. user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' # 将user_agent寫入頭資訊
5. values = {'name' : 'who','password':'123456'} # post資料
6. headers = { 'User-Agent' : user_agent }
7. data = urllib.urlencode(values) # 對post資料進行url編碼
8. req = urllib2.Request(url, data, headers)
9. response = urllib2.urlopen(req)
10. the_page = response.read()
urllib2帶cookie的使用
1. #coding:utf-8
2. import urllib2,urllib
3. import cookielib
4.
5. url = r'http://www.renren.com/ajaxLogin'
6.
7. #建立一個cj的cookie的容器
8. cj = cookielib.CookieJar()
9. opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
10. #将要POST出去的資料進行編碼
11. data = urllib.urlencode({"email":email,"password":password})
12. r = opener.open(url,data)
13. print cj
httplib簡單用法
1. #!/usr/bin/env python
2. # -*- coding: utf-8 -*-
3. import httplib
4. import urllib
5.
6. def sendhttp():
7. data = urllib.urlencode({'@number': 12524, '@type': 'issue', '@action': 'show'})
8. headers = {"Content-type": "application/x-www-form-urlencoded",
9. "Accept": "text/plain"}
10. conn = httplib.HTTPConnection('bugs.python.org')
11. conn.request('POST', '/', data, headers)
12. httpres = conn.getresponse()
13. print httpres.status
14. print httpres.reason
15. print httpres.read()
16.
17. if __name__ == '__main__':
18. sendhttp()
3,get put post delete 方法(參考自 python urllib2對http的get,put,post,delete)
#GET:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
def get():
    """Fetch a page with an HTTP GET and return the response body as a string."""
    # urllib2.urlopen requires a full URL including the scheme; a bare
    # host name is rejected as an unknown url type.
    URL = 'http://www.baidu.com'
    response = urllib2.urlopen(URL)  # without a data argument urlopen sends GET
    try:
        return response.read()
    finally:
        response.close()
#POST:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib
import urllib2
def post():
URL ='http://umbra.nascom.nasa.gov/cgi-bin/eit-catalog.cgi' #頁面的位址
values ={'obs_year':'2011','obs_month':'March', #post的值
'obs_day':'8','start_year':'2011'
,'start_month':'March','start_day':'8'
,'start_hour':'All Hours','stop_year':'2011'
,'stop_month':'March','stop_day':'8'
,'stop_hour':'All Hours','xsize':'All'
,'ysize':'All','wave':'all'
,'filter':'all','object':'all'
,'xbin':'all','ybin':'all'
,'highc':'all'}
data =urllib.urlencode(values) #适用urllib對資料進行格式化編碼
printdata #輸出檢視編碼後的資料格式
req =urllib2.Request(URL, data) #生成頁面請求的完整資料
response =urllib2.urlopen(req) #發送頁面請求
returnresponse.read() #擷取伺服器傳回的頁面資訊
#PUT
import urllib2
# Build a request carrying the payload to upload as its body.
request = urllib2.Request('http://example.org',data='your_put_data')
request.add_header('Content-Type', 'your/contenttype')
# Override get_method so urllib2 sends PUT instead of its default verb.
request.get_method = lambda: 'PUT'
response = urllib2.urlopen(request)
#DELETE
import urllib2
# NOTE(review): `uri` is not defined in this snippet - presumably the URL
# string of the resource to delete, supplied by the surrounding code.
request = urllib2.Request(uri)
# Override get_method so urllib2 sends DELETE instead of its default verb.
request.get_method = lambda: 'DELETE'
response = urllib2.urlopen(request)