
Scraping Lianjia rentals with Python: collecting the listing links and each page's detailed information

Because of final exams, a crawler I had planned to finish in a week dragged on for quite a while. There was an upside, though: my earlier attempts kept getting blocked, so in between revision sessions I wrote some Python code to work around the anti-scraping measures. It is posted below for discussion.

Counter-measures against anti-scraping

Randomly pick a set of request headers

headers.py

__author__ = 'Lee'

import random  # used to pick a random value for each header field

def requests_headers():
    """Return a header dict assembled from randomly chosen field values."""
    head_connection = ['Keep-Alive', 'close']
    head_accept = ['text/html,application/xhtml+xml,*/*']
    head_accept_language = ['zh-CN,fr-FR;q=0.5',
                            'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                       'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                       'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                       'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
    # Assemble one header set, choosing each field at random from the pools above
    header = {
        'Connection': head_connection[random.randrange(0, len(head_connection))],
        'Accept': head_accept[0],
        'Accept-Language': head_accept_language[random.randrange(0, len(head_accept_language))],
        'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))],
    }
    print('headers.py connection Success!')
    return header  # the return value is this header dict

# for i in range(100):  # generate 100 random header sets
#     print(requests_headers())
#     # print(random.randrange(1, 10))
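To check that the rotation actually works, here is a minimal smoke test of my own (not from the original post); it assumes headers.py is importable from the current directory, and httpbin.org is just an arbitrary echo service that reflects request headers back:

# smoke test: the printed User-Agent should vary between iterations
import requests
from headers import requests_headers

for _ in range(3):
    resp = requests.get('https://httpbin.org/headers', headers=requests_headers())
    print(resp.json()['headers'].get('User-Agent'))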

Pick a random proxy IP from an IP pool

ip_proxy.py

__author__ = 'Lee'

import random

# pool of candidate proxies; add more 'ip:port' strings as you collect them
ip_pool = [
    '117.143.109.136:80'
]

def ip_proxy():
    """Pick one proxy at random and return it as a requests-style proxies dict."""
    ip = ip_pool[random.randrange(0, len(ip_pool))]
    proxy_ip = 'http://' + ip
    proxies = {'http': proxy_ip}
    print(proxies)
    return proxies
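One caveat worth flagging: requests selects a proxy by URL scheme, and the dict above only maps 'http'. The Lianjia pages below are fetched over https, so they would bypass the proxy entirely. A variant covering both schemes (my sketch, assuming the proxy in the pool accepts HTTPS CONNECT tunnels):

# variant of ip_proxy() that also routes HTTPS traffic (illustration only)
import random

ip_pool = ['117.143.109.136:80']

def ip_proxy():
    proxy = 'http://' + random.choice(ip_pool)
    # both keys are needed so https://bj.lianjia.com requests use the proxy too
    return {'http': proxy, 'https': proxy}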

items_combination.py

__author__ = 'Lee'

from bs4 import BeautifulSoup
import requests
import pymongo
import time
from headers import requests_headers
from ip_proxy import ip_proxy

client = pymongo.MongoClient('localhost', 27017)  # connect to MongoDB
ceshi = client['ceshi']
url_list = ceshi['url_list']    # collection for listing links
item_list = ceshi['item_info']  # collection for listing details
url_list1 = []

channel = 'https://bj.lianjia.com/zufang/dongcheng/'

# spider1: scrape the listing links and store them with MongoDB
def get_pages_url(channel, pag):
    url = channel + 'pg' + pag
    wb_data = requests.get(url, headers=requests_headers(), proxies=ip_proxy())
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(1)
    # Lianjia's "no results" notice; must match the page text exactly
    no_data = '呣..没有找到相关内容,请您换个条件试试吧~'
    # bread_crumbs = soup.select('#house-lst > li')  # breadcrumb block (unused)
    item_url = soup.select('#house-lst > li > div > h2 > a')
    blank_url = str(soup.find(text=no_data))
    if no_data != blank_url:
        for url in item_url:
            url1 = url.get('href')
            url_list1.append(url1)
            # url_list.insert_one({'url': url1})  # uncomment to persist the links
            print(url1)
    else:
        pass  # empty page: this district has no more listings

# get_pages_url(channel, '2')

# spider2: scrape the details of one listing and store them with MongoDB
def get_messages(url):
    web_data = requests.get(url, headers=requests_headers(), proxies=ip_proxy())
    soup = BeautifulSoup(web_data.text, 'lxml')
    title = soup.title.text.split('|')[0]                          # listing title
    address = soup.select('div.zf-room > p > a')[0].text           # address
    price = soup.select('div.price > span.total')[0].text + '元'   # monthly rent
    area = soup.select('div.zf-room > p')[0].text.split(':')[-1]   # floor area
    home_url = url
    data = {
        'title': title,
        'address': address,
        'price': price,
        'area': area,
        'home_url': home_url,
    }
    print(data)
    item_list.insert_one(data)

get_messages('https://bj.lianjia.com/zufang/101101635089.html')

'''
Scratch notes (selectors around the empty-result page):
#house-lst > li > p
list-no-data clear
'''
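The post never shows the glue that runs spider1 over every page and then feeds the collected links into spider2. A hedged sketch of that driver loop, of my own making: the page count of 3 is an assumption for illustration, and it presumes the module-level test call at the bottom of items_combination.py has been commented out so the import has no side effects:

# sketch of a driver loop (not from the original post)
import time
from items_combination import get_pages_url, get_messages, url_list1, channel

for pag in range(1, 4):          # pages pg1..pg3; the real page count varies
    get_pages_url(channel, str(pag))

for url in url_list1:            # fetch and store details for every link found
    get_messages(url)
    time.sleep(1)                # pause between detail pages to stay polite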