1、說明
使用Python爬linkedin網站,爬公司規模在1001-5000的瑞士公司資訊,只爬公司名、公司規模、所在行業、所在地這四個資訊,且不使用linkedin官方API。(注意:下文示例中的搜尋URL與結果實際對應的是英國Swindon地區的公司。)
2、步驟說明
1>、模拟登入
2>、爬公司資訊
3>、擷取下一頁的URL
3、代碼片段
1>、模拟登入
從登入頁面填寫form表單,該表單中有一些JS驗證,在JS開啟時,會随着表單一起送出。不過不填也沒事,畢竟不是每個人都會開啟JS,故跳過。
這裡使用BeautifulSoup模組來提取html頁面中的一些資訊。
def login(s, username='**********', password='*********'):
    """Log a session in to LinkedIn by replaying the HTML login form.

    Fetches the login page, copies the hidden inputs found inside the element
    with id="login" and posts them, together with the credentials, to the
    login-submit URL. The response of that POST is not inspected, so a
    rejected login is not detected here.

    Args:
        s: Session-like object providing ``get`` and ``post``.
        username: Value sent as ``session_key``. The default is the masked
            placeholder from the original snippet and has to be replaced
            with a real account name.
        password: Value sent as ``session_password`` (masked placeholder by
            default).

    Returns:
        The same session object ``s``.

    Raises:
        ValueError: If the login form or one of its expected inputs is not
            present in the page.
    """
    # Open the login page.
    r = s.get('https://www.linkedin.com/uas/login')

    # Parse the HTML and locate the login form.
    form = BeautifulSoup(r.text, "lxml").find(id="login")
    if form is None:
        raise ValueError("login form (id='login') not found in the login page")

    def hidden_value(**criteria):
        """Return the ``value`` of the form's <input> matching ``criteria``."""
        field = form.find('input', **criteria)
        if field is None:
            raise ValueError("login form input not found: %r" % (criteria,))
        return field['value']

    # Echo back every hidden field the server put in the form and add the
    # credentials.
    payload = {
        'isJsEnabled': hidden_value(attrs={"name": 'isJsEnabled'}),
        'source_app': hidden_value(attrs={"name": 'source_app'}),
        'tryCount': hidden_value(id='tryCount'),
        'clickedSuggestion': hidden_value(id='clickedSuggestion'),
        'session_key': username,
        'session_password': password,
        'signin': hidden_value(attrs={"name": 'signin'}),
        'session_redirect': hidden_value(attrs={"name": 'session_redirect'}),
        'trk': hidden_value(attrs={"name": 'trk'}),
        'loginCsrfParam': hidden_value(id='loginCsrfParam-login'),
        'fromEmail': hidden_value(attrs={"name": 'fromEmail'}),
        'csrfToken': hidden_value(id='csrfToken-login'),
        'sourceAlias': hidden_value(id='sourceAlias-login'),
    }

    # Submit the form.
    s.post('https://www.linkedin.com/uas/login-submit', data=payload)
    return s
2>、爬公司資訊
linkedin将公司内容放在注釋裡面,囧!格式類似于下面這樣:
<code id="voltron_srp_main-content" style="display:none;">
<!--{"content":{"lix_header_lowercase":"control","lix_instant_co......."type":"companies"}},"status":"ok"}-->
</code>
這裡使用正則比對提取内容,然後使用json模組解析提取到的内容,最後取出所需的資訊。
def getCompanins(s, start_url):
    """Fetch one search-result page and return its companies as TSV lines.

    The page embeds its data as JSON inside an HTML comment within
    ``<code id="voltron_srp_main-content" ...>``; that JSON is extracted with
    a regular expression and parsed.

    Args:
        s: Session-like object providing ``get``.
        start_url: URL of the search-result page to download.

    Returns:
        A list with one string per result, formatted as
        ``"name\\tindustry\\tsize\\tlocation\\n"``.

    Raises:
        ValueError: If the embedded JSON block is not found in the page.
    """
    r = s.get(start_url)

    # Work on text, not bytes: the str pattern below cannot be applied to a
    # bytes object on Python 3.
    html = r.text

    # Non-greedy so the match ends at the first "--></code>" even when other
    # <code> comment blocks follow on the same line.
    match = re.search(
        r'<code id="voltron_srp_main-content" style="display:none;"><!--(.+?)--></code>',
        html)
    if match is None:
        raise ValueError("voltron_srp_main-content block not found in %s" % start_url)

    # Parse the embedded JSON.
    code_json = json.loads(match.group(1))

    # Pull the four wanted fields out of every result entry.
    company_json = code_json["content"]["page"]["voltron_unified_search_json"]["search"]["results"]
    company_list = []
    for entry in company_json:
        company = entry["company"]
        company_list.append("%s\t%s\t%s\t%s\n" % (
            company["fmt_canonicalName"],
            company["fmt_industry"],
            company["fmt_size"],
            company["fmt_location"],
        ))
    return company_list
3>、擷取下一頁的URL
擷取下一頁的連結,該連結還是在注釋裡面,原理同上步驟2>進行提取。
需要說明的是,當前頁是最後一頁時,是沒有nextPage的,所以這裡需要進行判斷:如果沒有nextPage,則傳回字串"NULL"作為結束標記。
def getNextPageURL(s, start_url):
    """Fetch a search-result page and return the URL of the following page.

    The pagination data lives in the same JSON-in-a-comment block as the
    results (``<code id="voltron_srp_main-content" ...>``).

    Args:
        s: Session-like object providing ``get``.
        start_url: URL of the current search-result page.

    Returns:
        The absolute URL of the next page, or the string ``"NULL"`` when the
        pagination data has no ``nextPage`` entry (i.e. on the last page).

    Raises:
        ValueError: If the embedded JSON block is not found in the page.
    """
    r = s.get(start_url)

    # Work on text, not bytes: the str pattern below cannot be applied to a
    # bytes object on Python 3.
    html = r.text

    # Non-greedy so the match ends at the first "--></code>" even when other
    # <code> comment blocks follow on the same line.
    match = re.search(
        r'<code id="voltron_srp_main-content" style="display:none;"><!--(.+?)--></code>',
        html)
    if match is None:
        raise ValueError("voltron_srp_main-content block not found in %s" % start_url)

    code_json = json.loads(match.group(1))
    resultPagination = code_json["content"]["page"]["voltron_unified_search_json"]["search"]["baseData"]["resultPagination"]

    # The last page carries no "nextPage" entry; "NULL" is the end marker.
    if "nextPage" not in resultPagination:
        return "NULL"
    return "http://www.linkedin.com/" + resultPagination["nextPage"]["pageURL"]
4、完整代碼
這裡是将重複部分合并後的代碼。
import requests
import re
import json
from bs4 import BeautifulSoup, Comment
def login(s, username='*********', password='*********'):
    """Log a session in to LinkedIn by replaying the HTML login form.

    Fetches the login page, copies the hidden inputs found inside the element
    with id="login" and posts them, together with the credentials, to the
    login-submit URL. The response of that POST is not inspected, so a
    rejected login is not detected here.

    Args:
        s: Session-like object providing ``get`` and ``post``.
        username: Value sent as ``session_key``. The default is the masked
            placeholder from the original listing and has to be replaced
            with a real account name.
        password: Value sent as ``session_password`` (masked placeholder by
            default).

    Returns:
        The same session object ``s``.

    Raises:
        ValueError: If the login form or one of its expected inputs is not
            present in the page.
    """
    r = s.get('https://www.linkedin.com/uas/login')

    form = BeautifulSoup(r.text, "lxml").find(id="login")
    if form is None:
        raise ValueError("login form (id='login') not found in the login page")

    def hidden_value(**criteria):
        """Return the ``value`` of the form's <input> matching ``criteria``."""
        field = form.find('input', **criteria)
        if field is None:
            raise ValueError("login form input not found: %r" % (criteria,))
        return field['value']

    # Echo back every hidden field the server put in the form and add the
    # credentials.
    payload = {
        'isJsEnabled': hidden_value(attrs={"name": 'isJsEnabled'}),
        'source_app': hidden_value(attrs={"name": 'source_app'}),
        'tryCount': hidden_value(id='tryCount'),
        'clickedSuggestion': hidden_value(id='clickedSuggestion'),
        'session_key': username,
        'session_password': password,
        'signin': hidden_value(attrs={"name": 'signin'}),
        'session_redirect': hidden_value(attrs={"name": 'session_redirect'}),
        'trk': hidden_value(attrs={"name": 'trk'}),
        'loginCsrfParam': hidden_value(id='loginCsrfParam-login'),
        'fromEmail': hidden_value(attrs={"name": 'fromEmail'}),
        'csrfToken': hidden_value(id='csrfToken-login'),
        'sourceAlias': hidden_value(id='sourceAlias-login'),
    }

    s.post('https://www.linkedin.com/uas/login-submit', data=payload)
    return s
def getCompanins(s, start_url):
    """Fetch one search-result page and return its companies as TSV lines.

    The page embeds its data as JSON inside an HTML comment within
    ``<code id="voltron_srp_main-content" ...>``; that JSON is extracted with
    a regular expression and parsed.

    Args:
        s: Session-like object providing ``get``.
        start_url: URL of the search-result page to download.

    Returns:
        A list with one string per result, formatted as
        ``"name\\tindustry\\tsize\\tlocation\\n"``.

    Raises:
        ValueError: If the embedded JSON block is not found in the page.
    """
    # NOTE(review): a later ``def getCompanins(code_json)`` in this listing
    # rebinds the same name, so this two-argument version is not reachable
    # once the whole module has been loaded.
    r = s.get(start_url)

    # Work on text, not bytes: the str pattern below cannot be applied to a
    # bytes object on Python 3.
    html = r.text

    # Non-greedy so the match ends at the first "--></code>" even when other
    # <code> comment blocks follow on the same line.
    match = re.search(
        r'<code id="voltron_srp_main-content" style="display:none;"><!--(.+?)--></code>',
        html)
    if match is None:
        raise ValueError("voltron_srp_main-content block not found in %s" % start_url)

    code_json = json.loads(match.group(1))
    company_json = code_json["content"]["page"]["voltron_unified_search_json"]["search"]["results"]

    company_list = []
    for entry in company_json:
        company = entry["company"]
        company_list.append("%s\t%s\t%s\t%s\n" % (
            company["fmt_canonicalName"],
            company["fmt_industry"],
            company["fmt_size"],
            company["fmt_location"],
        ))
    return company_list
def getCompanins(code_json):
    """Turn the parsed search-page JSON into tab-separated company lines.

    Args:
        code_json: Parsed JSON of a search page; the result entries are read
            from ``content.page.voltron_unified_search_json.search.results``
            and each entry must hold a ``"company"`` mapping.

    Returns:
        A list with one string per result, formatted as
        ``"name\\tindustry\\tsize\\tlocation\\n"``.
    """
    search_data = code_json["content"]["page"]["voltron_unified_search_json"]["search"]
    row_template = "%s\t%s\t%s\t%s\n"
    # Column order of the output line.
    columns = ("fmt_canonicalName", "fmt_industry", "fmt_size", "fmt_location")
    companies = (hit["company"] for hit in search_data["results"])
    return [
        row_template % tuple(info[key] for key in columns)
        for info in companies
    ]
def getNextPageURL(s, start_url):
    """Fetch a search-result page and return the URL of the following page.

    The pagination data lives in the JSON-in-a-comment block
    ``<code id="voltron_srp_main-content" ...>`` of the page.

    Args:
        s: Session-like object providing ``get``.
        start_url: URL of the current search-result page.

    Returns:
        The absolute URL of the next page, or the string ``"NULL"`` when the
        pagination data has no ``nextPage`` entry (i.e. on the last page).

    Raises:
        ValueError: If the embedded JSON block is not found in the page.
    """
    # NOTE(review): a later ``def getNextPageURL(code_json)`` in this listing
    # rebinds the same name, so this two-argument version is not reachable
    # once the whole module has been loaded.
    r = s.get(start_url)

    # Work on text, not bytes: the str pattern below cannot be applied to a
    # bytes object on Python 3.
    html = r.text

    # Non-greedy so the match ends at the first "--></code>" even when other
    # <code> comment blocks follow on the same line.
    match = re.search(
        r'<code id="voltron_srp_main-content" style="display:none;"><!--(.+?)--></code>',
        html)
    if match is None:
        raise ValueError("voltron_srp_main-content block not found in %s" % start_url)

    code_json = json.loads(match.group(1))
    resultPagination = code_json["content"]["page"]["voltron_unified_search_json"]["search"]["baseData"]["resultPagination"]

    # The last page carries no "nextPage" entry; "NULL" is the end marker.
    if "nextPage" not in resultPagination:
        return "NULL"
    return "http://www.linkedin.com/" + resultPagination["nextPage"]["pageURL"]
def getNextPageURL(code_json):
    """Return the next search page's URL from the parsed page JSON.

    Args:
        code_json: Parsed JSON of a search page; pagination is read from
            ``content.page.voltron_unified_search_json.search.baseData.resultPagination``.

    Returns:
        ``"http://www.linkedin.com/"`` followed by the next page's
        ``pageURL``, or the string ``"NULL"`` when there is no ``nextPage``
        entry.
    """
    search_data = code_json["content"]["page"]["voltron_unified_search_json"]["search"]
    pagination = search_data["baseData"]["resultPagination"]
    # Guard clause: no "nextPage" entry means this was the last page.
    if "nextPage" not in pagination:
        return "NULL"
    return "http://www.linkedin.com/" + pagination["nextPage"]["pageURL"]
def search(s, start_url):
    """Walk every search-result page and write the companies to ``result``.

    Starting at ``start_url``, each page is downloaded, its embedded JSON
    (the comment inside ``<code id="voltron_srp_main-content" ...>``) is
    parsed, the company lines are written to a file named ``result`` in the
    current directory, and the loop continues with the next page's URL until
    ``getNextPageURL`` returns ``"NULL"``.

    Args:
        s: Session-like object providing ``get``.
        start_url: URL of the first search-result page, or ``"NULL"`` to
            write an empty file.

    Raises:
        ValueError: If a page does not contain the embedded JSON block.
    """
    with open("result", "wb") as of:
        while start_url != "NULL":
            r = s.get(start_url)

            # Work on text, not bytes: the str pattern below cannot be
            # applied to a bytes object on Python 3.
            html = r.text

            # Non-greedy so the match ends at the first "--></code>" even
            # when other <code> comment blocks follow on the same line.
            match = re.search(
                r'<code id="voltron_srp_main-content" style="display:none;"><!--(.+?)--></code>',
                html)
            if match is None:
                raise ValueError("voltron_srp_main-content block not found in %s" % start_url)
            code_json = json.loads(match.group(1))

            start_url = getNextPageURL(code_json)
            company_list = getCompanins(code_json)

            # The file is opened in binary mode, so encode explicitly; this
            # also keeps non-ASCII company names from failing on write.
            of.writelines(line.encode("utf-8") for line in company_list)
if __name__ == '__main__':
    # First page of the company search to crawl.
    # NOTE(review): the f_CCR=gb%3A4634 facet looks like a UK region filter
    # and f_CS=G like the company-size filter — confirm against the search
    # that is actually intended.
    start_url = r"http://www.linkedin.com/vsearch/c?type=companies&keywords=*&orig=FCTD&rsid=4625091521449736984098&pageKey=oz-winner&trkInfo=tarId%3A1449736114717&search=Search&f_CCR=gb%3A4634&openFacets=N,CCR,JO,CS&f_CS=G"

    # Log in on a fresh session, then crawl all result pages with it.
    s = login(requests.session())
    search(s, start_url)
5、結果
name industry size location
Dyson Electrical/Electronic Manufacturing 1001-5000 employees Swindon, United Kingdom
National Trust Nonprofit Organization Management 1001-5000 employees Swindon, United Kingdom
STFC Research 1001-5000 employees Swindon, United Kingdom
Honda of the UK Manufacturing Ltd. Automotive 1001-5000 employees Swindon, United Kingdom
NERC Research 1001-5000 employees Swindon, United Kingdom
Swindon Borough Council Government Administration 1001-5000 employees Swindon, United Kingdom
Smiths News Wholesale 1001-5000 employees Swindon, United Kingdom
Defence Academy of the UK Defense & Space 1001-5000 employees Swindon, United Kingdom
ARI UK Automotive 1001-5000 employees Swindon, United Kingdom
Aster Group UK Nonprofit Organization Management 1001-5000 employees Swindon, United Kingdom
RWE Systems UK Utilities 1001-5000 employees Swindon, United Kingdom
Great Western Ambulance Service NHS Trust Hospital & Health Care 1001-5000 employees Swindon, United Kingdom
EOH (Europe) Information Technology and Services 1001-5000 employees Swindon, United Kingdom
Westinghouse Rail Systems Australia Computer Software 1001-5000 employees Swindon, United Kingdom
St Johns Marlborough Education Management 1001-5000 employees Swindon, United Kingdom