# 天天看點

# python lxml 擷取企業資訊 new
# (Page title: retrieving enterprise information with Python and lxml.)

#coding:utf-8
 
 
import numpy as np
import pandas as pd
 
from lxml import etree
import csv
import requests
 
# Column letters and a starting row number; not referenced elsewhere in the
# visible code.
col = "ABCDEFGHIJKLM"
row = 1

# Module-level column accumulators. getData() appends to them and run()
# replaces them with fresh empty arrays after every listing page. Each name
# is bound to its own, distinct empty array.

# Enterprise-level fields (filled on the first row of each enterprise).
(nameArray1, nameArray3, nameArray4,
 nameArray5, nameArray6, nameArray7) = (np.array([]) for _ in range(6))

# Per-row fields, one value per qualification row.
(nameArray8, nameArray10, nameArray11,
 nameArray12, nameArray13, nameArray14,
 nameArray15, nameArray16, nameArray17) = (np.array([]) for _ in range(9))

# Two fields read from the cells that follow the section marker.
nameArrayEnd1, nameArrayEnd3 = np.array([]), np.array([])
 
 
# Text of the cell that introduces the trailing section of the detail page
# (the literal reads roughly "safety licence information").
# NOTE(review): the literal is kept exactly as found in this file; confirm it
# matches the text the site actually serves.
_SECTION_MARKER = '安全許可資訊'

# Indexes into the centred-cell list that are read for row 0; row m reads the
# same indexes shifted by 10 * m. Index 9 is skipped.
_ROW_CELL_INDEXES = (8, 10, 11, 12, 13, 14, 15, 16, 17)


def _cell_text(cell):
    """Return the element's text with spaces removed, or '' if it has none."""
    text = cell.text
    return '' if text is None else text.replace(' ', '')


def _fetch_until_ok(idUrl):
    """Request idUrl repeatedly until the server answers with HTTP 200."""
    import time

    while True:
        try:
            response = requests.get(url=idUrl, timeout=(5, 15))
        except requests.RequestException:
            # Only network-level failures are retried, so Ctrl-C and
            # programming errors are no longer swallowed by the loop.
            print("wait  " + idUrl)
            time.sleep(1)
            continue

        if response.status_code == 200:
            return response

        print(response)
        print("wait  " + idUrl)
        # Short pause so a failing server is not hit in a tight loop.
        time.sleep(1)


def _find_section_end(cells):
    """Return the 1-based position of the section-marker cell, or None.

    A later hit overrides an earlier one; scanning stops once it is more
    than three cells past the most recent hit.
    """
    found = None
    for position, cell in enumerate(cells, start=1):
        text = cell.text
        if text is not None and text.replace(' ', '') == _SECTION_MARKER:
            found = position
        if found is not None and position > found + 3:
            break
    return found


def _extend(column, values):
    """Return column with values appended, as a string array.

    The accumulators start as empty float64 arrays; casting first makes the
    concatenation string-with-string, so it does not depend on NumPy's
    number-to-string dtype promotion.
    """
    return np.append(column.astype(str), values)


def getData(idUrl):
    """Scrape one enterprise detail page and rewrite the current page's CSV.

    The page at idUrl is downloaded (retrying until HTTP 200), its table
    cells are read, and one CSV row per qualification row is appended to the
    module-level nameArray* accumulators. The accumulated data is then
    written to "shuju<yema>.csv", overwriting the previous version.

    The enterprise-level columns (nameArray1..7, nameArrayEnd1/3) are filled
    only on the first row of an enterprise; later rows carry '' there.

    If the page does not have the expected layout (too few cells, no section
    marker, or a row running past the end of the cell list) the function
    prints "error return" and returns without touching the accumulators or
    the CSV file.

    Args:
        idUrl: URL of the enterprise detail page.

    Returns:
        None.
    """
    global nameArray1, nameArray3, nameArray4, nameArray5, nameArray6, nameArray7
    global nameArray8, nameArray10, nameArray11, nameArray12, nameArray13
    global nameArray14, nameArray15, nameArray16, nameArray17
    global nameArrayEnd1, nameArrayEnd3
    global yema

    response = _fetch_until_ok(idUrl)
    html = etree.HTML(response.text)
    if html is None:
        # Nothing parseable came back (e.g. an empty body).
        print("error return")
        return

    # The legal-representative value is taken from the unfiltered cell list.
    all_cells = html.xpath('//td')
    try:
        faren = _cell_text(all_cells[14])
    except IndexError:
        print("error return")
        return

    cells = html.xpath('//td[@align = "center"]')

    end = _find_section_end(cells)
    if end is None:
        # Without the marker the row count and the trailing cells are unknown.
        print("error return")
        return
    print("eeeeeeeeeeeeeeeeeeee" + str(end))

    # Build every row locally first so that a malformed page cannot leave the
    # global accumulators with different lengths.
    rows = []
    try:
        for m in range(end // 10):
            if m == 0:
                lead = [_cell_text(cells[k]) for k in (1, 3, 4, 5, 6)]
                lead.append(faren)
                tail = [_cell_text(cells[end + 1]), _cell_text(cells[end + 3])]
                print(lead[0])
            else:
                lead = [''] * 6
                tail = [''] * 2
            shift = 10 * m
            detail = [_cell_text(cells[shift + k]) for k in _ROW_CELL_INDEXES]
            rows.append(lead + detail + tail)
    except IndexError:
        print("error return")
        return

    if rows:
        columns = [list(column) for column in zip(*rows)]
        nameArray1 = _extend(nameArray1, columns[0])
        nameArray3 = _extend(nameArray3, columns[1])
        nameArray4 = _extend(nameArray4, columns[2])
        nameArray5 = _extend(nameArray5, columns[3])
        nameArray6 = _extend(nameArray6, columns[4])
        nameArray7 = _extend(nameArray7, columns[5])

        nameArray8 = _extend(nameArray8, columns[6])
        nameArray10 = _extend(nameArray10, columns[7])
        nameArray11 = _extend(nameArray11, columns[8])
        nameArray12 = _extend(nameArray12, columns[9])
        nameArray13 = _extend(nameArray13, columns[10])
        nameArray14 = _extend(nameArray14, columns[11])
        nameArray15 = _extend(nameArray15, columns[12])
        nameArray16 = _extend(nameArray16, columns[13])
        nameArray17 = _extend(nameArray17, columns[14])

        nameArrayEnd1 = _extend(nameArrayEnd1, columns[15])
        nameArrayEnd3 = _extend(nameArrayEnd3, columns[16])

    # The file is rewritten after every enterprise, so it always holds all
    # rows collected so far for the current listing page.
    writefile = "shuju" + str(yema) + ".csv"
    data = [nameArray1, nameArray3, nameArray4, nameArray5, nameArray6, nameArray7,
            nameArray8, nameArray10, nameArray11, nameArray12, nameArray13, nameArray14,
            nameArray15, nameArray16, nameArray17, nameArrayEnd1, nameArrayEnd3]
    data = np.transpose(data)
    ser2 = pd.DataFrame(data, columns=['企業名稱', '營業證', '位址', '注冊資本', '組織機構代碼号', '法人',
                                       '序号', '編号', '日期', '狀态', '資質序列', '類别', '等級', '發證機關', '核準日期', '證書', '有效期'])
    print("write file")

    ser2.to_csv(writefile, encoding="utf_8_sig")
 
 
 
 
import sys
 
# Listing page number currently being processed. run() assigns it for each
# page and getData() reads it to build the "shuju<page>.csv" file name.
yema = 0
 
def run():
    """Scrape listing pages START..END (inclusive) given on the command line.

    sys.argv[1] and sys.argv[2] are the first and last listing page numbers.
    For each page the listing is downloaded, every onclick attribute found
    under //td/p/a is split into name / id / organisation code, and getData()
    is called with the resulting detail URL. After a page has been processed
    the module-level nameArray* accumulators are replaced with empty arrays
    so the next page starts a fresh CSV.

    A listing request that fails at the network level is retried until it
    succeeds. The HTTP status code is printed but not otherwise checked.

    Raises:
        SystemExit: if fewer than two command-line arguments are supplied.
        ValueError: if an argument is not an integer.
    """
    global yema

    global nameArray1
    global nameArray3
    global nameArray4
    global nameArray5
    global nameArray6
    global nameArray7

    global nameArray8
    global nameArray10
    global nameArray11
    global nameArray12
    global nameArray13
    global nameArray14
    global nameArray15
    global nameArray16
    global nameArray17

    global nameArrayEnd1
    global nameArrayEnd3

    import time

    if len(sys.argv) < 3:
        raise SystemExit("usage: " + sys.argv[0] + " START_PAGE END_PAGE")
    start = int(sys.argv[1])
    end = int(sys.argv[2])

    for page in range(start, end + 1):
        yema = page
        url = "http://124.115.170.171:7001/PDR/network/informationSearch/informationSearchList?name=&type=&certDeadline=&certReviewUnit=&regType=&pid1=&pid2=&pid3=&pageNumber=" + str(yema)+"&libraryName=enterpriseLibrary"

        # Decrementing the loop variable of a for-range loop does not repeat
        # the iteration, so the retry is an explicit inner loop.
        while True:
            try:
                res = requests.get(url=url, timeout=(5, 15))
                break
            except requests.RequestException:
                print("waiting..." + url)
                time.sleep(1)

        print(res.status_code)
        html = etree.HTML(res.text)

        # No parse tree (e.g. empty body) means there are no links to follow.
        aRes = html.xpath('//td/p/a/@onclick') if html is not None else []
        for onclick in aRes:
            ss = onclick.split(',')
            for s0 in ss:
                print(s0)
            if len(ss) < 3:
                # Not the three-argument call this parser expects.
                print("skip  " + onclick)
                continue

            # Fixed-width slices that trim the characters surrounding each
            # argument; presumably a call prefix and quotes. Verify against
            # the live markup.
            name = ss[0][6:-1]
            aid = ss[1][1:-2]
            code = ss[2][1:-1]

            print(name)
            print(aid)
            print(code)

            idUrl = 'http://124.115.170.171:7001/PDR/network/Enterprise/Informations/qyszit?enid=' + aid + "&name=" + name + "&org_code=" + code + "&type="

            print(idUrl)

            getData(idUrl)

        # Start the next page with empty accumulators.
        nameArray1 = np.array([])
        nameArray3 = np.array([])
        nameArray4 = np.array([])
        nameArray5 = np.array([])
        nameArray6 = np.array([])
        nameArray7 = np.array([])

        nameArray8 = np.array([])
        nameArray10 = np.array([])
        nameArray11 = np.array([])
        nameArray12 = np.array([])
        nameArray13 = np.array([])
        nameArray14 = np.array([])
        nameArray15 = np.array([])
        nameArray16 = np.array([])
        nameArray17 = np.array([])

        nameArrayEnd1 = np.array([])
        nameArrayEnd3 = np.array([])
        
# Start scraping only when this file is executed as a script.
if __name__ == '__main__':
    run()