天天看點

python爬取58同城租房資訊_python爬蟲:找房助手V1.0-爬取58同城租房資訊(示例代碼)...

#!/usr/bin/python
# -*- encoding:utf-8 -*-
import datetime
import re
import sys
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup

# from datetime import datetime

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str/unicode conversions use UTF-8. Neither name exists on
# Python 3, where text is Unicode already, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Build the soup for a URL; shared by every scraping step below.
def urlBS(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        A ``BeautifulSoup`` tree built with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 instead of trusting the encoding guessed from the headers.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, "lxml")

# Ask the user for the search criteria interactively; every question has a
# default that is used when the answer is left blank.
def _ask_int(read_line, prompt, default):
    """Show *prompt* once and return the answer as an int, else *default*."""
    try:
        return int(read_line(prompt).strip())
    except (ValueError, EOFError):
        # Blank or non-numeric answer, or stdin closed: use the default.
        return default


def get_source_url(read_line=None):
    """Build the 58.com (Chengdu) whole-flat rental search URL from user input.

    The user is asked, in order, for the listing source, the minimum price,
    the maximum price, the number of rooms and a free-text keyword. Blank,
    non-numeric or out-of-range answers fall back to the defaults named in
    the prompts (private listings, 500, 1000, one room, empty keyword).

    Answers are read as plain text and converted with ``int`` rather than
    being evaluated as Python expressions.

    Args:
        read_line: Optional callable ``read_line(prompt) -> str`` used to
            obtain each answer. Defaults to ``raw_input`` on Python 2 and
            ``input`` on Python 3.

    Returns:
        The search URL as a string.
    """
    if read_line is None:
        try:
            read_line = raw_input  # Python 2: returns the typed text as-is
        except NameError:
            read_line = input  # Python 3: input() already returns text

    # "/zufang/" restricts the search to whole-flat rentals.
    base_url = 'http://cd.58.com/zufang/'
    # Path segment per source choice: "0/" is private landlords, "1/" is
    # agents. (A fourth choice, '?isreal=true' for the verified-listings
    # section, was left disabled by the original author.)
    source_from = {1: '', 2: '0/', 3: '1/'}
    room_num = {0: '', 1: 'j1/', 2: 'j2/', 3: 'j3/', 4: 'j4/', 5: 'j5/'}

    source_key = _ask_int(
        read_line,
        '請按序号輸入你想要的房屋來源,1為不限,2為個人房源,3為經紀人(預設為2個人房源):\n',
        2)
    if source_key not in source_from:
        source_key = 2

    price_min = _ask_int(read_line, '請輸入你期望的價格下限(不輸預設為500):\n', 500)
    price_max = _ask_int(read_line, '請輸入你期望的價格上限(不輸預設為1000):\n', 1000)

    room_key = _ask_int(
        read_line,
        '請輸入你想要的房子間數:0為不限,1為1室,2為2室,3為3室,4為4室,5為4室以上(不輸預設為1室):\n',
        1)
    if room_key not in room_num:
        room_key = 1

    try:
        key_words = read_line('請輸入你想要搜尋的其他關鍵詞,如小區名稱,地鐵位置等(不輸預設為空):\n')
    except EOFError:
        key_words = ''

    price = 'minprice=%d_%d' % (price_min, price_max)
    return (base_url + source_from[source_key] + room_num[room_key]
            + '?' + price + '&key=' + key_words)

# Sample search URL, kept for reference (commented out):
# new_url='http://cd.58.com/zufang/0/j1/?minprice=600_800&PGTID=0d300008-0006-6cd9-6ba7-a7672ec996c3&ClickID=3'

# Collect the URL of every result page. The site does not say how many pages
# there are, so the only way forward is to follow the "next" link of each
# page. The first "next" link leads back to the page already being shown, so
# whenever a next link exists its page number is replaced by current page + 1.

# Page-number path segment of a listing URL, e.g. "/pn12/".
_PAGE_SEGMENT = re.compile(r'/pn(\d+)/')
_SITE_ROOT = 'http://cd.58.com'


def _next_page_url(current_url, next_href):
    """Return the URL of the page that follows *current_url*.

    *next_href* is the raw ``href`` of the page's "next" link. Its
    ``/pnN/`` segment, if it has one, is rewritten to the current page
    number plus one; a URL without that segment counts as page 1.
    """
    if next_href.startswith(('http://', 'https://')):
        candidate = next_href
    else:
        candidate = _SITE_ROOT + next_href
    found = _PAGE_SEGMENT.search(current_url)
    current_page = int(found.group(1)) if found else 1
    # The whole "/pnN/" segment is replaced, slashes and prefix included, so
    # the rest of the URL stays intact.
    return _PAGE_SEGMENT.sub('/pn%d/' % (current_page + 1), candidate, count=1)


def get_new_list(source_url, fetch=None):
    """Return the URLs of all result pages, starting with *source_url*.

    Each page is fetched and searched for ``<a class="next">``. Crawling
    stops when a page has no such link (or the link has no ``href``), or
    when the computed next URL has been visited before, which protects
    against a link that keeps pointing at the same page.

    Args:
        source_url: URL of the first result page.
        fetch: Optional callable ``fetch(url) -> soup`` used to load a
            page. Defaults to this module's ``urlBS``.

    Returns:
        A list of page URLs in crawl order, without duplicates.
    """
    if fetch is None:
        fetch = urlBS

    page_urls = [source_url]
    visited = set(page_urls)
    current_url = source_url
    while True:
        next_link = fetch(current_url).find('a', {'class': 'next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break
        upcoming = _next_page_url(current_url, next_href)
        if upcoming in visited:
            break
        visited.add(upcoming)
        page_urls.append(upcoming)
        current_url = upcoming
    return page_urls

# Collect the links to the individual listing pages.
def get_house_url(new_url, fetch=None):
    """Return the listing-page links found on one result page.

    Every ``<a>`` inside a ``<div class="img_list">`` is taken as a link to
    a listing. Anchors without an ``href`` are skipped.

    Args:
        new_url: URL of a search-result page.
        fetch: Optional callable ``fetch(url) -> soup`` used to load the
            page. Defaults to this module's ``urlBS``.

    Returns:
        A list of ``href`` strings in document order.
    """
    if fetch is None:
        fetch = urlBS
    anchors = fetch(new_url).select('div[class="img_list"] a')
    return [anchor.get('href') for anchor in anchors if anchor.get('href')]

# Scrape one listing, dropping suspected scams and listings that were last
# updated more than a month ago.

# Characters stripped from scraped fields before they are printed and saved.
_FIELD_NOISE = re.compile(r'\n|\t|\r| ')
# A calendar date such as 2016-05-31 anywhere inside the "updated" label.
_UPDATE_DATE = re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})')
# A phone number must start with "1" followed by five digits; numbers that
# are hidden or start with 0 (landlines) are treated as scams.
_MOBILE_PREFIX = re.compile(r'1\d{5}')
# Description wording that marks a listing as unwanted.
# NOTE(review): the last alternative, `"+"`, is a regex for two or more
# consecutive double quotes; it may have been meant as a literal plus sign.
# Confirm before changing.
_SUSPICIOUS_WORDS = re.compile(
    u'(我是房東|男士勿擾|男生勿擾|限女生|微信|男士|男性|男生|女性|女的|姐妹|"+")')
# Listings older than this many days are skipped.
_MAX_AGE_DAYS = 30


def _element_text(soup, tag, css_class):
    """Return the text of the first matching element, or None if absent."""
    node = soup.find(tag, {'class': css_class})
    return None if node is None else node.text


def _squeeze(text):
    """Remove newlines, tabs, carriage returns and spaces; None becomes ''."""
    return _FIELD_NOISE.sub('', text) if text else ''


def _parse_update_date(label):
    """Extract the YYYY-MM-DD date from *label*; None if there is no valid one."""
    found = _UPDATE_DATE.search(label or '')
    if found is None:
        return None
    try:
        return datetime.date(*[int(part) for part in found.groups()])
    except ValueError:
        return None


def house_info(house_url):
    """Print one listing and append it to the open output file if it passes.

    A listing is skipped, with nothing printed or written, when:

    * its phone number is missing or does not start with ``1`` plus five
      digits;
    * its description contains one of the suspicious phrases;
    * its update date is missing, unreadable, or more than 30 days old.

    Elements missing from the page no longer abort the crawl: optional
    fields become empty strings. The date is found by pattern, so the exact
    label text in front of it does not matter, and ``strptime`` is not
    used.

    Args:
        house_url: URL of the listing page.

    Returns:
        None. Output goes to stdout and to the module-level file object
        ``f``, which the caller must have opened for writing.
    """
    soup = urlBS(house_url)

    # Private landlords and agents use different phone-number markup.
    tel = _element_text(soup, 'span', 'tel-num tel-num-geren pl30 f30')  # private
    if tel is None:
        tel = _element_text(soup, 'span', 'tel-num pl30 f30')  # agent
    if tel is None or not _MOBILE_PREFIX.match(tel):
        return

    situation = (_element_text(soup, 'div', 'description-content') or '').strip()
    if _SUSPICIOUS_WORDS.search(situation):
        return

    update_time = _element_text(soup, 'span', 'pl10') or ''
    update_date = _parse_update_date(update_time)
    oldest_allowed = datetime.date.today() - datetime.timedelta(days=_MAX_AGE_DAYS)
    if update_date is None or update_date < oldest_allowed:
        return

    title = _element_text(soup, 'h1', 'main-title font-heiti') or ''
    price = _squeeze(_element_text(soup, 'i', 'ncolor'))

    # The detail rows hold, in order: flat size, residential community and
    # location. Rows that are absent yield empty strings.
    rows = soup.find_all('li', {'class': 'house-primary-content-li clearfix'})
    house_content, house_Community, house_place = [
        _squeeze(rows[index].text) if index < len(rows) else ''
        for index in range(3)
    ]

    facility = _squeeze(_element_text(
        soup, 'li', 'house-primary-content-li clearfix person-config'))
    contact = _squeeze(_element_text(
        soup, 'li', 'house-primary-content-li clearfix person-contact'))

    for field in (house_url, situation, tel, title, price, house_content,
                  house_Community, house_place, facility, contact):
        print(field)
    print(update_time + '\n\n\n')

    f.write('----------------------------------------------------------------------------------\n')
    f.write('\n'.join([house_url, price, house_content, house_Community,
                       house_place, title, situation, facility]) + '\n\n')

if __name__ == '__main__':
    source_url = get_source_url()
    print(source_url)
    # Fixed example, kept for reference:
    # source_url='http://cd.58.com/zufang/0/?minprice=500_1500&key=四河'

    # Stored under its own name so the get_new_list function is not shadowed.
    page_urls = get_new_list(source_url)

    # house_info() writes through the module-level name `f`. Opening the
    # file in "w" mode empties it first, and the with-block closes it even
    # if a request fails part-way through the crawl.
    with open("house_rent.txt", "w") as f:
        print('正在下載下傳,請稍候。。。\n\n')
        for new_url in page_urls:
            # On Python 2 the first URL is a byte string assembled from user
            # input; decode it so every URL is handled as text.
            if isinstance(new_url, bytes):
                new_url = new_url.decode('utf-8')
            # Listings are processed one at a time. The original author
            # tried a 4-worker ThreadPool with pool.map(house_info, ...) and
            # gave up because of the error
            # "'module' object has no attribute '_strptime'".
            for each in get_house_url(new_url):
                house_info(each)