Pandas基礎
- Series
- DataFrame
- 篩選資料
- 指派及操作
- 處理空值
- concat資料合并
  - 外連接 内連接
- merge合并
  - 外連接 内連接 左連接 右連接
Series
import pandas as pd
# Build a Series from a plain list; the index falls back to the default 0..n-1.
s1 = pd.Series([4, -7, -5, 3])
print(s1)
print(s1.values)  # the underlying values
print(s1.index)  # the index labels
print("********")

# Build a Series with an explicit label index.
s2 = pd.Series(
    [4.0, 6.5, -0.5, 4.2],
    index=['d', 'b', 'a', 'c'],
)
print(s2)
print(s2['a'])  # look up a single value by its label
print(s2[['a', 'b', 'c', 'd']])  # look up several labels at once
print("********")

# A Series can be thought of as a fixed-length, ordered dict,
# so a dict converts directly: keys become the index.
dic1 = {'apple': 5, 'pen': 3, 'applepen': 10}
s3 = pd.Series(dic1)
print(s3)
DataFrame
# Column-oriented input: every dict key becomes one DataFrame column.
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [10000, 30000, 50000, 80000],
    'pay': [5000, 20000, 30000, 3000],
}
df1 = pd.DataFrame(data)

# Tour of a few basic attributes; each printout is followed by a separator.
print(df1.columns, "********", sep="\n")  # column labels
print(df1.index, "********", sep="\n")  # row labels
print(df1.values, "********", sep="\n")  # raw values as an ndarray
print(df1.describe(), "********", sep="\n")  # summary statistics
print(df1.transpose())  # rows and columns swapped
import pandas as pd
import numpy as np
# A 3x4 frame with default integer labels on both axes.
df2 = pd.DataFrame(np.arange(12).reshape(3, 4))

# The same values with deliberately unsorted row and column labels.
df3 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['a', 'c', 'b'],
    columns=[2, 33, 44, 5],
)

df4 = df3.sort_index(axis='columns')  # order the columns by their labels
df5 = df3.sort_index()  # order the rows by their labels
df6 = df3.sort_values(by=44)  # order the rows by the values in column 44
篩選資料
import pandas as pd
import numpy as np
# Five consecutive days starting 2017-01-01 serve as the row index.
dates = pd.date_range('20170101', periods=5)
df1 = pd.DataFrame(
    np.arange(15).reshape(5, 3),
    index=dates,
    columns=['A', 'B', 'C'],
)

# Pull one column out as a Series (attribute access, df1.A, is equivalent).
print(df1['A'], "********", sep="\n")
# Positional row slice: rows 0 and 1.
print(df1[0:2], "********", sep="\n")
# Label-based row slice on the date index; both endpoints are included.
print(df1['20170102':'20170104'], "********", sep="\n")

# Selection by label.
print(df1.loc['20170102'], "********", sep="\n")  # one row
print(df1.loc['20170101', ['A', 'C']], "********", sep="\n")  # one row, two columns
print(df1.loc[:, ['A', 'B']], "********", sep="\n")  # all rows, two columns

# Selection by integer position.
print(df1.iloc[2], "********", sep="\n")  # row at position 2, i.e. the third row
print(df1.iloc[1:3, 2:3], "********", sep="\n")  # row and column position slices
print(df1.iloc[[1, 2, 4], [1, 2]])  # explicit row and column positions
指派及操作
import pandas as pd
import numpy as np
# Integer "dates" 20170101..20170106 are used as plain integer row labels.
dates = np.arange(20170101, 20170107)
df1 = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

df1.iloc[2, 2] = 100  # assign by integer position
df1.loc[20170102, 'B'] = 200  # assign by label
df1[df1.A > 10] = 0  # zero out every row whose A value exceeds 10
# Use a single .loc call rather than the chained form df1.A[mask] = 1:
# chained assignment writes to a temporary object and does not update df1
# when copy-on-write is active.
df1.loc[df1.A == 0, 'A'] = 1

df1['E'] = 10  # add a column filled with a scalar
df1['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)  # add a column, aligned on the index
df1.loc[20170107, ['A', 'B', 'C']] = [1, 2, 3]  # add a row; unspecified columns become NaN

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=['A', 'B', 'C', 'D', 'E', 'F'])
s1.name = 'S1'
# DataFrame.append was removed in pandas 2.0. Turning the Series into a
# one-row frame (labelled by its name) and concatenating gives the same result.
df2 = pd.concat([df1, s1.to_frame().T])

df1.insert(1, 'G', df2['E'])  # insert df2's E column as column G at position 1
g = df1.pop('G')  # remove column G and keep it as a Series
df1.insert(len(df1.columns), 'G', g)  # re-insert it as the last column
del df1['G']  # delete column G

df2 = df1.drop(['A', 'B'], axis=1)  # drop columns A and B (axis=1 means columns)
df2 = df1.drop([20170101, 20170102], axis=0)  # drop two rows by label (axis=0 means rows)
處理空值
# axis: 0 operates on rows, 1 operates on columns.
# how: 'any' matches when one or more values are missing,
#      'all' matches only when every value is missing.
# None of these calls modifies df2; each one returns a new object.
df2.dropna(how='any', axis=0)  # drop every row that contains a missing value
df2.dropna(how='any', axis=1)  # drop every column that contains a missing value
df2.fillna(0)  # replace missing values with 0
df2.isna()  # boolean mask marking the missing values
np.any(df2.isna())  # True as soon as at least one value is missing
np.all(df2.isna())  # True only if every value is missing
concat資料合并
# Three 3x4 frames sharing the same columns, holding 0..11, 12..23 and 24..35.
df1 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12, 24).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df3 = pd.DataFrame(
    np.arange(24, 36).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)

# Stack vertically; every frame keeps its own row labels.
df4 = pd.concat([df1, df2, df3])
# Stack vertically and renumber the rows, discarding the original index.
df5 = pd.concat([df1, df2, df3], ignore_index=True)
# Place the frames side by side.
df6 = pd.concat([df1, df2, df3], axis='columns')
外連接 内連接
# Two frames whose columns only partly overlap (shared: 'a' and 'c').
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'c', 'd', 'e'])
# Outer join: keep the union of the columns and fill the gaps with NaN.
df3 = pd.concat([df1, df2], join='outer', ignore_index=True)
# Inner join: keep only the columns present in both frames.
df4 = pd.concat([df1, df2], join='inner', ignore_index=True)

# Frames with different numbers of rows (3 vs 4) for side-by-side concatenation.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((4, 3)), columns=['a', 'c', 'd'])
# Side by side, restricted to df1's row index. The join_axes keyword was
# removed from pd.concat in pandas 1.0, so df2 is reindexed onto df1's index
# before concatenating instead.
df3 = pd.concat([df1, df2.reindex(df1.index)], axis=1)
# Side by side with the default outer join on the row index; rows missing
# from one frame are filled with NaN.
df4 = pd.concat([df1, df2], axis=1)
merge合并
# Two frames that share a 'key' column with identical key values.
df_left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
df_right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
# Join the two frames row by row on matching 'key' values.
df = df_left.merge(df_right, on='key')
外連接 内連接 左連接 右連接
# Two frames joined on a composite key (key1, key2); only some key pairs match.
df1 = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
df2 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K3'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# `how` selects the join type: 'left', 'right', 'inner' (the default) or 'outer'.
df_outer = df1.merge(df2, how='outer', on=['key1', 'key2'])
df_inner = df1.merge(df2, on=['key1', 'key2'])
df_left = df1.merge(df2, how='left', on=['key1', 'key2'])
df_right = df1.merge(df2, how='right', on=['key1', 'key2'])

# indicator=True adds a '_merge' column reporting where each row came from.
df_outer1 = df1.merge(df2, how='outer', on=['key1', 'key2'], indicator=True)
# Passing a string instead names that column 'indicator_column'.
df_outer2 = df1.merge(
    df2,
    how='outer',
    on=['key1', 'key2'],
    indicator='indicator_column',
)

# Join on the row index of both frames rather than on a column.
df1 = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
df2 = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
df = df1.merge(df2, how='outer', left_index=True, right_index=True)

# Both frames carry an 'age' column; suffixes tell the left and right copies apart.
df_boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
df_girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
df_all = df_boys.merge(df_girls, how='outer', on='k', suffixes=['_boy', '_girl'])