#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Console scraper for 58.com Chengdu whole-unit rental listings.

Interactively builds a search URL, walks the result pages, filters out
likely-scam listings, and writes the survivors to house_rent.txt.
"""
import datetime
import re
import sys
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup

# NOTE(review): the original Python 2-only hack
#   reload(sys); sys.setdefaultencoding('utf-8')
# was removed — Python 3 source and str are UTF-8-native.
# Shared fetch-and-parse helper used by every scraping function below.
def urlBS(url):
    """Fetch *url* and return its BeautifulSoup parse tree.

    Forces UTF-8 decoding (58.com pages occasionally mis-declare their
    charset) and parses with the lxml parser.

    :param url: absolute URL of the page to fetch.
    :returns: ``BeautifulSoup`` tree of the response body.
    :raises requests.RequestException: on network failure or timeout.
    """
    # Timeout added so a stalled connection cannot hang the whole crawl.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "lxml")
    return soup
# Interactively collect the search criteria; every prompt has a default.
def get_source_url():
    """Prompt the user for search filters and return the 58.com search URL.

    Asks for listing source (any / private landlord / agent), a price
    range, the number of rooms, and optional free-text keywords. Any
    empty or invalid answer falls back to a sensible default.

    :returns: fully assembled search URL string.
    """
    base_url = 'http://cd.58.com/zufang/'  # /zufang/ = whole-unit rentals
    # 1 = any source, 2 = private landlord (/0/), 3 = agent (/1/).
    # (4: '?isreal=true' — the "verified listings" area — was never wired up.)
    source_from = {1: '', 2: '0/', 3: '1/'}
    try:
        source_key = int(input('請按序号輸入你想要的房屋來源,1為不限,2為個人房源,3為經紀人(預設為2個人房源):\n'))
    except (ValueError, EOFError):
        source_key = 2
    if source_key not in source_from:  # out-of-range answer -> default
        source_key = 2

    try:
        price_min = str(int(input('請輸入你期望的價格下限(不輸預設為500):\n')))
    except (ValueError, EOFError):
        price_min = '500'
    try:
        price_max = str(int(input('請輸入你期望的價格上限(不輸預設為1000):\n')))
    except (ValueError, EOFError):
        price_max = '1000'
    price = 'minprice=' + price_min + '_' + price_max

    # 0 = any, 1..4 = exact room count, 5 = more than 4 rooms.
    room_num = {0: '', 1: 'j1/', 2: 'j2/', 3: 'j3/', 4: 'j4/', 5: 'j5/'}
    try:
        room_key = int(input('請輸入你想要的房子間數:0為不限,1為1室,2為2室,3為3室,4為4室,5為4室以上(不輸預設為1室):\n'))
    except (ValueError, EOFError):
        room_key = 1
    if room_key not in room_num:
        room_key = 1

    # Free-text keywords (neighbourhood name, metro station, ...).
    try:
        key_words = input('請輸入你想要搜尋的其他關鍵詞,如小區名稱,地鐵位置等(不輸預設為空):\n')
    except EOFError:
        key_words = ''

    source_url = (base_url + source_from[source_key] + room_num[room_key]
                  + '?' + price + '&key=' + key_words)
    return source_url
# Example of a search URL with an explicit page segment:
#   http://cd.58.com/zufang/0/j1/?minprice=600_800&...  (page N is /pnN/)
def get_new_list(source_url):
    """Follow the "next page" links and collect every result-page URL.

    58.com does not publish a page count, so we follow the ``a.next``
    link until it disappears. Quirk worked around here: on some pages
    the first "next" link points back at the current page, so when the
    current URL already carries a ``/pnN/`` segment we compute page N+1
    ourselves instead of trusting the link.

    :param source_url: first search-result page URL.
    :returns: list of page URLs, starting with *source_url*.
    """
    # \d+ (not a single '.') so page numbers >= 10 are handled too.
    page_re = re.compile(r'/pn(\d+)/')
    new_url = source_url
    new_url_list = [new_url]
    while True:
        soup = urlBS(new_url)
        finder = soup.find('a', {'class': 'next'})
        if not finder:
            break  # no "next" link -> last page reached
        next_page = 'http://cd.58.com' + finder['href']
        # Inspect the CURRENT page's URL (the original looked at the fixed
        # start URL, so manual page-stepping never triggered after page 1).
        now_page = page_re.search(new_url)
        if now_page:
            # Replace the whole /pnN/ segment, keeping the surrounding
            # slashes (the original substituted a bare digit and broke
            # the URL).
            new_page = page_re.sub('/pn{}/'.format(int(now_page.group(1)) + 1),
                                   next_page)
        else:
            new_page = next_page
        if new_page == new_url_list[-1]:
            break  # "next" loops back -> we are done
        new_url_list.append(new_page)
        new_url = new_page
    return new_url_list
# Collect the detail-page links from one search-result page.
def get_house_url(new_url):
    """Return the listing detail-page URLs found on one result page.

    :param new_url: URL of a search-result page.
    :returns: list of href strings from the image-list anchors.
    """
    soup = urlBS(new_url)
    href_list = soup.select('div[class="img_list"] a')
    # .get() guards against anchors without an href attribute.
    house_url_list = [each.get('href') for each in href_list if each.get('href')]
    return house_url_list
# Scrape one listing; drop likely scams and listings older than 30 days.
def house_info(house_url):
    """Scrape a single listing page; print and record it if it passes filters.

    Rejected listings: phone numbers not starting with 1 (landline or
    masked — the author's scam heuristic), descriptions containing
    blacklisted phrases, and listings last updated more than 30 days ago.
    Accepted listings are printed and appended to the module-global file
    handle ``f`` opened by ``__main__``.

    :param house_url: URL of the listing detail page.
    """
    # NOTE(review): removed the debug leftover that unconditionally
    # overwrote house_url with one hard-coded listing URL — with it in
    # place, every call scraped the same page.
    print(house_url)
    soup = urlBS(house_url)

    # Private-landlord and agent pages use different phone-number spans.
    tel_node = soup.find('span', {'class': 'tel-num tel-num-geren pl30 f30'})
    if tel_node is None:
        tel_node = soup.find('span', {'class': 'tel-num pl30 f30'})
    tel = tel_node.text
    # Mobile numbers start with 1; anything else is treated as a scammer
    # (a genuine landlord would not list a landline, per the author).
    match_tel = re.search(r'^1\d{5}.*', tel)

    situation = soup.find('div', {'class': 'description-content'}).text.strip()
    # Descriptions with these phrases are rejected as scams/spam.
    match_si = re.search(u'(我是房東|男士勿擾|男生勿擾|限女生|微信|男士|男性|男生|女性|女的|姐妹)',
                         situation)

    # Last-updated date; listings older than 30 days carry no useful info.
    update_time = soup.find('span', {'class': 'pl10'}).text
    update_date = datetime.datetime.strptime(
        update_time.replace('更新時間:', ''), "%Y-%m-%d").date()
    thirty_days_ago = datetime.date.today() + datetime.timedelta(days=-30)
    day_line = (update_date - thirty_days_ago).days

    if not match_tel:   # hidden / landline number -> likely scam
        return
    if match_si:        # blacklisted phrase -> likely scam
        return
    if day_line < 0:    # stale listing (> 30 days old)
        return

    print(situation)
    print(tel)

    # Title.
    title = soup.find('h1', {'class': 'main-title font-heiti'}).text
    print(title)

    # Collapse all whitespace runs in scraped text fragments.
    p = re.compile(r'\n|\t|\r| ')

    # Price.
    rent_price = soup.find('i', {'class': 'ncolor'}).text
    price = p.sub('', rent_price)
    print(price)

    # Size / layout, community and location share one <li> class; the
    # latter two are optional on some pages.
    house = soup.find_all('li', {'class': 'house-primary-content-li clearfix'})
    house_content = p.sub('', house[0].text)
    print(house_content)
    house_Community = p.sub('', house[1].text) if len(house) > 1 else ''
    print(house_Community)
    house_place = p.sub('', house[2].text) if len(house) > 2 else ''
    print(house_place)

    # Facilities block is optional.
    facility_node = soup.find(
        'li', {'class': 'house-primary-content-li clearfix person-config'})
    facility = p.sub('', facility_node.text) if facility_node is not None else ''
    print(facility)

    # Contact person.
    contact = p.sub('', soup.find(
        'li', {'class': 'house-primary-content-li clearfix person-contact'}).text)
    print(contact)
    print(update_time + '\n\n\n')

    f.write('----------------------------------------------------------------------------------\n')
    f.write(house_url + '\n' + price + '\n' + house_content + '\n'
            + house_Community + '\n' + house_place + '\n' + title + '\n'
            + situation + '\n' + facility + '\n\n')
if __name__ == '__main__':
    source_url = get_source_url()
    print(source_url)
    # NOTE(review): removed the debug leftover that overwrote source_url
    # with a hard-coded search URL, and renamed the result variable — the
    # original rebound the name get_new_list, shadowing the function.
    page_list = get_new_list(source_url)
    print('正在下載下傳,請稍候。。。\n\n')
    # Module-global handle written by house_info(); mode "w" truncates
    # any previous run's output.
    f = open("house_rent.txt", "w")
    try:
        for new_url in page_list:
            for each in get_house_url(new_url):
                # Sequential on purpose: the original author abandoned
                # ThreadPool because of a '_strptime' lazy-import race.
                house_info(each)
    finally:
        f.close()