1. First, simulate the login. The main goal is to obtain the cookies; what we actually need are the "name" and "value" fields inside them.
Method 1: the conventional approach. Use the requests library to simulate the login and obtain a CookieJar, convert it into a dict with requests.utils.dict_from_cookiejar(cookiejar), and pass that dict into scrapy.FormRequest (remember to add headers) to complete the simulated login. Finally, the callback returns to start_urls and the parse function is called to parse and extract the data.
def get_xsrf(self):
    '''_xsrf is a dynamically changing parameter'''
    index_url = 'https://www.zhihu.com'
    # Fetch the _xsrf token needed for login
    index_page = self.session.get(index_url, headers=self.headers)
    html = index_page.text
    pattern = r'name="_xsrf" value="(.*?)"'
    # re.findall returns a list here
    _xsrf = re.findall(pattern, html)
    return _xsrf[0]

# Fetch the captcha
def get_captcha(self):
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = self.session.get(captcha_url, headers=self.headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # Display the captcha with Pillow's Image;
    # if Pillow is not installed, open captcha.jpg in the source directory and type it in manually
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except:
        print(u'Please open captcha.jpg under %s and type it in manually' % os.path.abspath('captcha.jpg'))
    captcha = input("please input the captcha\n>")
    return captcha
def login(self, secret='xxxxx', account='xxxxx'):
    _xsrf = self.get_xsrf()
    self.headers["X-Xsrftoken"] = _xsrf
    self.headers["X-Requested-With"] = "XMLHttpRequest"
    post_url = 'https://www.zhihu.com/login/phone_num'
    # The entered username is treated as a phone number here
    postdata = {
        '_xsrf': _xsrf,
        'password': secret,
        'phone_num': account
    }
    # First try to log in directly, without a captcha
    login_page = self.session.post(post_url, data=postdata, headers=self.headers)
    login_code = login_page.json()
    if login_code['r'] == 1:
        # Login without a captcha failed;
        # retry with a captcha
        postdata["captcha"] = self.get_captcha()
        login_page = self.session.post(post_url, data=postdata, headers=self.headers)
        login_code = login_page.json()
        print(login_code['msg'])
    return self.session.cookies

# The cookies could also be saved to a file, so that next time you can log in
# with the cookies directly instead of entering the account and password
def start_requests(self):  # override start_requests to perform the user login
    # Get the cookies from the login page; remember to pass headers
    cookiesjar = self.login()  # obtained via "return self.session.cookies"; a CookieJar cannot be passed to scrapy.FormRequest directly
    cookiesDict = requests.utils.dict_from_cookiejar(cookiesjar)  # cookiesDict can be used
    return [scrapy.FormRequest('https://www.zhihu.com/', cookies=cookiesDict, headers=self.headers, callback=self.after_login)]  # cookiesDict can be passed in

def after_login(self, response):  # hands control back to the original start_urls flow
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True, headers=self.headers)  # parse is called by default
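As the comment above suggests, the session cookies can be persisted once and reused on later runs instead of logging in every time. A minimal sketch, assuming a local file of your choosing (cookies.json and the helper names are hypothetical):

import json
import requests

def save_cookies(session, path='cookies.json'):
    # Dump the session's CookieJar as a plain name/value dict
    with open(path, 'w') as f:
        json.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

def load_cookies(path='cookies.json'):
    # Returns the dict that can be passed to scrapy.FormRequest(cookies=...)
    with open(path) as f:
        return json.load(f)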
Method 2: use selenium to simulate the login and obtain the cookies, define a set_cookies function to normalize them into a dict, and pass the dict into scrapy.FormRequest (remember to add headers) to complete the simulated login. Finally, the callback returns to start_urls and the parse function is called to parse and extract the data.
def login(self, name, passwd):
    url = 'https://www.zhihu.com/#signin'
    # Chrome, PhantomJS, etc. also work here; if the driver is not on PATH,
    # its location must be given explicitly
    driver = webdriver.Firefox()
    driver.set_window_size(1200, 1200)
    driver.get(url)
    print('Starting login')
    driver.find_element_by_class_name('signin-switch-password').click()
    driver.find_element_by_class_name('active').click()
    name_field = driver.find_element_by_name('account')
    name_field.send_keys(name)
    passwd_field = driver.find_element_by_name('password')
    passwd_field.send_keys(passwd)
    driver.find_element_by_xpath('//button[contains(@class,"sign-button")]').click()
    time.sleep(10)
    return driver.get_cookies()
def set_cookies(self, drive_cookies):
    # Normalize the selenium cookies by rebuilding them as a name/value dict
    dict_cookies = {}
    for each in drive_cookies:
        dict_cookies[each['name']] = each['value']
    return dict_cookies
def start_requests(self):  # override start_requests to perform the user login
    # Log in via selenium and pass the normalized cookies; remember to pass headers
    login_name = 'xxxxx'
    login_passwd = 'xxxxxx'
    cookies = self.set_cookies(self.login(login_name, login_passwd))
    return [scrapy.FormRequest('https://www.zhihu.com/', cookies=cookies, headers=self.headers, callback=self.after_login)]

def after_login(self, response):  # hands control back to the original start_urls flow
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True, headers=self.headers)  # parse is called by default
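The fixed time.sleep(10) above simply waits and hopes the login has finished. A sketch of an explicit wait instead, assuming some element that only appears after a successful login (the CSS selector here is hypothetical):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 30 seconds until an element that only exists after login shows up
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.AppHeader-profile'))  # hypothetical selector
)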
Method 3 (the most direct and simplest!): take the cookies straight from the browser and pass them into scrapy.FormRequest (remember to add headers) to complete the simulated login. Finally, the callback returns to start_urls and the parse function is called to parse and extract the data.
cookies = {
    'xxxx': 'xxxx',
    'xxxx': 'xxxx',
    'xxxx': 'xxxx',
}

def start_requests(self):  # override start_requests to perform the user login
    # Send the browser cookies directly; remember to pass headers
    return [scrapy.FormRequest('https://www.zhihu.com/', headers=self.header, cookies=self.cookies, callback=self.after_login)]

def after_login(self, response):  # hands control back to the original start_urls flow
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True, headers=self.header)  # parse is called by default
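If you copy the raw Cookie header from the browser's developer tools rather than the individual name/value pairs, a small helper (hypothetical, not part of the original spider; the cookie names below are just illustrative) can split it into the dict Scrapy expects:

raw_cookie = '_xsrf=xxxx; d_c0=xxxx; z_c0=xxxx'  # pasted from the browser, values elided
cookies = dict(
    pair.strip().split('=', 1)  # split each "name=value" pair once, on the first '='
    for pair in raw_cookie.split(';')
)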
2. Define the items, including a def get_insert(self) method used for data handling in the pipeline.
class zhihu_Loader(ItemLoader):
    # Define the default ItemLoader output processor
    default_output_processor = TakeFirst()

class ZhihuAnswerItem(scrapy.Item):
    author_name = scrapy.Field()
    author_id = scrapy.Field()
    answer_content = scrapy.Field(
        input_processor=MapCompose(soup)
    )
    answer_url = scrapy.Field()
    question_id = scrapy.Field()
    answer_parise_num = scrapy.Field()
    answer_comments_num = scrapy.Field()
    answer_creat_time = scrapy.Field(
        input_processor=MapCompose(timestamp_covert_to_datetime)
    )
    answer_update_time = scrapy.Field(
        input_processor=MapCompose(timestamp_covert_to_datetime)
    )
    answer_crawl_time = scrapy.Field()

    def get_insert(self):
        # ON DUPLICATE KEY UPDATE ... resolves the primary-key conflict that occurs
        # when an answer has been updated and is crawled again
        insert_sql = '''
            insert into answer_database(author_name, author_id, answer_content, answer_url, question_id,
                answer_parise_num, answer_comments_num, answer_creat_time, answer_update_time, answer_crawl_time)
            VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE answer_content=VALUES(answer_content), answer_parise_num=VALUES(answer_parise_num),
                answer_comments_num=VALUES(answer_comments_num), answer_update_time=VALUES(answer_update_time)
        '''
        params = (self['author_name'], self['author_id'], self['answer_content'], self['answer_url'],
                  self['question_id'], self['answer_parise_num'], self['answer_comments_num'],
                  self['answer_creat_time'], self['answer_update_time'], self['answer_crawl_time'])
        return insert_sql, params
class ZhihuQuestionItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    question_id = scrapy.Field()
    question_url = scrapy.Field()
    question_title = scrapy.Field()
    topic = scrapy.Field(
        output_processor=list_to_str  # override output_processor, replacing the default
    )
    answer_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    focus_num = scrapy.Field()
    watch_num = scrapy.Field()
    content = scrapy.Field()

    def get_insert(self):
        insert_sql = '''
            insert into question_database(question_id, question_url, title, topic, answer_num,
                comment_num, focus_num, watch_num, content)
            VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)
        '''
        params = (self['question_id'], self['question_url'], self['question_title'], self['topic'],
                  self['answer_num'], self['comment_num'], self['focus_num'], self['watch_num'], self['content'])
        return insert_sql, params
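The items above reference several processor helpers (soup, timestamp_covert_to_datetime, get_num, list_to_str) that are not shown in this section. A minimal sketch of what they might look like; these implementations are assumptions, only the names come from the item definitions:

import re
import datetime
from bs4 import BeautifulSoup

def soup(value):
    # Strip HTML tags from the answer content (assumed behaviour)
    return BeautifulSoup(value, 'html.parser').get_text()

def timestamp_covert_to_datetime(value):
    # Convert a Unix timestamp into a datetime object
    return datetime.datetime.fromtimestamp(int(value))

def get_num(value):
    # Pull the first integer out of extracted text such as "1,024 answers" (assumed format)
    match = re.search(r'(\d[\d,]*)', value)
    return int(match.group(1).replace(',', '')) if match else 0

def list_to_str(values):
    # Join the extracted topic strings into one comma-separated string
    return ','.join(values)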
3. Write the parsing functions to extract the content the business needs. There are three parts:
1) parse: parses the page and collects question URLs (an open issue: why do I only ever get 3 valid question URLs per page? Zhihu seems to load the rest dynamically with JS; in the Network panel there is a "batch" POST request whose request payload appears to load new URLs, but its Content-Type is application/x-protobuf, so the payload I see is garbled; something to investigate later). Each matched URL is handed to question_parser via the callback.
def parse(self, response):  # parse is called by default
    all_urls = response.xpath('.//@href').extract()
    # all_urls = response.css('a::attr(href)').extract()
    all_urls = [urljoin(response.url, url) for url in all_urls]
    # Filter out javascript: links and other useless URLs
    # all_urls = filter(lambda x: True if 'https' in x else False, all_urls)
    # Keep only question URLs
    for url in all_urls:
        print(url)
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
        if match_obj:
            requests_url = match_obj.group(1)  # the question URL
            question_id = match_obj.group(2)  # the question id
            yield scrapy.Request(url, headers=self.header, meta={'question_id': question_id}, callback=self.question_parser)
2) question_parser: receives the question URL and extracts the required content into the item. Declare the loader with item_loader = zhihu_Loader(item=ZhihuQuestionItem(), response=response), load the values with question_item = item_loader.load_item(), and remember to yield question_item at the end. Finally, the constructed answer_json_url is passed to answer_parser via the callback.
def question_parser(self, response):
    item_loader = zhihu_Loader(item=ZhihuQuestionItem(), response=response)  # response is the response returned by scrapy
    item_loader.add_value("question_id", response.meta['question_id'])
    item_loader.add_value("question_url", response.url)
    item_loader.add_xpath("question_title", '//h1[@class="QuestionHeader-title"]/text()')  # processed in scrapy.Field
    item_loader.add_xpath("topic", '//div[@class="QuestionHeader-topics"]//text()')  # "//" selects all descendant nodes; multiple topics are handled in scrapy.Field
    item_loader.add_xpath("answer_num", '//a[@class="QuestionMainAction"]//text()')  # handled in scrapy.Field; with fewer than 3 answers this element is missing, so num can default to 0
    item_loader.add_xpath("comment_num", '//button [@class="Button Button--plain"]/text()')
    item_loader.add_xpath("watch_num", '//div[@class="NumberBoard-item"]/div[2]/text()')
    item_loader.add_xpath("focus_num", '//button[@class="Button NumberBoard-item Button--plain"]/div[2]/text()')
    item_loader.add_xpath("content", '//div[@class="QuestionRichText QuestionRichText--expandable QuestionRichText--collapsed"]/div//text()')
    question_item = item_loader.load_item()
    # Packet capture revealed the JSON API that returns the question's answers;
    # the json_url is built directly here and handed to answer_parser
    answer_json_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data[*].is_normal,admin_closed_com" \
                      "ment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by" \
                      ",suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permi" \
                      "ssion,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is" \
                      "_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&offset" \
                      "={1}&limit={2}&sort_by=default".format(response.meta['question_id'], 0, 20)  # note the use of format
    yield scrapy.Request(answer_json_url, headers=self.header, callback=self.answer_parser)
    yield question_item
3) answer_parser: much like question_parser, but without an item_loader, since each JSON response only contains 20 answers. if is_end == False checks whether more pages follow, and yield scrapy.Request recurses on the next URL until all of the data has been fetched.
def answer_parser(self, response):
    # Process the answers from the JSON response
    answer_json = json.loads(response.text)
    # is_start / is_end tell whether this JSON page is the first / last page
    is_start = answer_json['paging']['is_start']
    is_end = answer_json['paging']['is_end']
    next_url = answer_json['paging']['next']
    # Extract each answer
    for answer in answer_json['data']:
        answer_item = ZhihuAnswerItem()
        answer_item['author_name'] = answer['author']['name']
        answer_item['author_id'] = answer['author']['id']
        answer_item['answer_content'] = answer['content']
        answer_item['answer_url'] = answer['url']
        answer_item['question_id'] = answer['question']['id']
        answer_item['answer_parise_num'] = answer['voteup_count']
        answer_item['answer_comments_num'] = answer['comment_count']
        answer_item['answer_creat_time'] = datetime.datetime.fromtimestamp(answer['created_time'])  # timestamp to datetime
        answer_item['answer_update_time'] = datetime.datetime.fromtimestamp(answer['updated_time'])  # timestamp to datetime
        answer_item['answer_crawl_time'] = datetime.date.today()
        yield answer_item
    if is_end == False:
        yield scrapy.Request(next_url, headers=self.header, callback=self.answer_parser)
4. Insert the data into MySQL asynchronously.
1) Method 1: the def do_insert(self, cursor, item) approach, which is highly configurable:
class MysqlTwistedPipline_getSQLfunc(object):
    # Insert into the database asynchronously
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            port=settings["MYSQL_PORT"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DB"],
            use_unicode=True,
            charset=settings["MYSQL_CHARSET"],
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to make the MySQL insert asynchronous
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)

    def handle_error(self, failure):
        # Handle exceptions raised by the async insert
        print(failure)

    def do_insert(self, cursor, item):  # do not pass two item objects in one call, otherwise only one will be executed
        # The SQL and parameters live on the item itself
        insert_sql, params = item.get_insert()
        cursor.execute(insert_sql, params)
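Both pipeline variants read their connection parameters from settings.py. A minimal sketch of the corresponding entries, assuming the key names used by from_settings above (the concrete values are placeholders):

# settings.py (values are placeholders)
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'xxxxx'
MYSQL_DB = 'zhihu'
MYSQL_CHARSET = 'utf8mb4'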
2) Method 2: hard-code the SQL directly in the pipeline. Simpler, but less configurable:
class MysqlTwistedPipline(object):
    # Insert into the database asynchronously
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            port=settings["MYSQL_PORT"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DB"],
            use_unicode=True,
            charset=settings["MYSQL_CHARSET"],
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to make the MySQL insert asynchronous
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)

    def handle_error(self, failure):
        # Handle exceptions raised by the async insert
        print(failure)

    def do_insert(self, cursor, item):  # do not pass two item objects in one call, otherwise only one will be executed
        # The actual insert, branching on the item class
        if item.__class__.__name__ == 'ZhihuAnswerItem':
            insert_sql = '''
                insert into answer_database(author_name, author_id, answer_content, answer_url, question_id,
                    answer_parise_num, answer_comments_num, answer_creat_time, answer_update_time, answer_crawl_time)
                VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            '''
            cursor.execute(insert_sql, (item['author_name'], item['author_id'], item['answer_content'], item['answer_url'],
                                        item['question_id'], item['answer_parise_num'], item['answer_comments_num'],
                                        item['answer_creat_time'], item['answer_update_time'], item['answer_crawl_time']))
        if item.__class__.__name__ == 'ZhihuQuestionItem':  # executed when the incoming item is a ZhihuQuestionItem
            insert_sql = '''
                insert into question_database(question_id, question_url, title, topic, answer_num,
                    comment_num, focus_num, watch_num, content)
                VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)
            '''
            cursor.execute(insert_sql, (
                item['question_id'], item['question_url'], item['question_title'], item['topic'], item['answer_num'],
                item['comment_num'], item['focus_num'], item['watch_num'], item['content']))
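Finally, whichever pipeline is chosen has to be enabled in settings.py. A sketch, assuming the Scrapy project module is named zhihu (the module path is hypothetical):

# settings.py
ITEM_PIPELINES = {
    'zhihu.pipelines.MysqlTwistedPipline_getSQLfunc': 300,  # or MysqlTwistedPipline for method 2
}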