# -*- coding: utf-8 -*-
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# Demo 1: detecting, dropping and filling missing values in a Series.
#
# dtype is pinned to object so the mixed result of fillna(1) below stays valid
# (and prints "dtype: object") even on pandas versions that would otherwise
# infer a dedicated string dtype for this data.
string_data = Series(['aa', 'bb', 'cc', np.nan], dtype=object)
print(string_data)
# 0     aa
# 1     bb
# 2     cc
# 3    NaN
print(string_data.isnull())
# 0    False
# 1    False
# 2    False
# 3     True
# dtype: bool

# Python's None is treated as a missing (NA) value as well.
#
# Missing-data helpers used in this script:
#   dropna()  - filter out missing entries; a threshold can be supplied to
#               tune how much missing data is tolerated
#   fillna()  - replace missing entries with a given value
#   isnull()  - boolean mask that is True where a value is missing
#   notnull() - boolean mask that is True where a value is present

# Boolean-mask selection and dropna() give the same three rows.
print(string_data[string_data.notnull()])
print(string_data.dropna())
# 0    aa
# 1    bb
# 2    cc
print(string_data.fillna(1))
# 0    aa
# 1    bb
# 2    cc
# 3     1
# dtype: object
# Demo 2: dropping rows / columns of a DataFrame depending on how much of
# them is missing.
data = DataFrame(
    [
        [1, 2, 3],
        [np.nan, 2, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, 2, 3],
    ]
)
print(data)
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  2.0  3.0
# 2  NaN  NaN  NaN
# 3  NaN  2.0  3.0

# Drop every row that contains at least one NA (the dropna() defaults,
# spelled out explicitly).
print(data.dropna(axis=0, how='any'))
#      0    1    2
# 0  1.0  2.0  3.0

# Drop only the rows in which every value is NA.
print(data.dropna(axis=0, how='all'))
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  2.0  3.0
# 3  NaN  2.0  3.0

# Append a column (label 4) that is entirely NA, then drop the columns in
# which every value is NA.
data[4] = np.nan
print(data)
#      0    1    2   4
# 0  1.0  2.0  3.0 NaN
# 1  NaN  2.0  3.0 NaN
# 2  NaN  NaN  NaN NaN
# 3  NaN  2.0  3.0 NaN
print(data.dropna(how='all', axis=1))
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  2.0  3.0
# 2  NaN  NaN  NaN
# 3  NaN  2.0  3.0
# Demo 3: row filtering with a non-NA threshold, then filling missing data.
#
# NOTE: the sample outputs below were captured from separate runs, so the
# random values differ from one listing to the next; only the NaN / fill
# pattern is meaningful.
data = DataFrame(np.random.randn(7, 3))
print(data)
#           0         1         2
# 0  0.329393 -0.849128  1.864103
# 1 -1.413807  0.461709  1.097039
# 2  0.191843  0.654831 -0.527389
# 3 -1.012322 -0.210649 -0.226674
# 4  0.864600  0.960556 -1.436670
# 5 -1.411272 -0.315413  0.575377
# 6 -0.819563 -0.893195  0.057541

# .loc replaces the removed .ix indexer; label slices are inclusive, so
# ":4" blanks rows 0 through 4 of column 1.
data.loc[:4, 1] = np.nan
print(data)
#           0         1         2
# 0  0.051068       NaN  0.333383
# 1  0.793481       NaN -1.240897
# 2  0.705667       NaN  0.797441
# 3  0.088447       NaN  0.059333
# 4 -1.638566       NaN -0.853278
# 5  0.676200  0.151795 -1.329852
# 6 -0.849945  0.955315 -0.526976

# Rows 0 through 2 of column 2 become NA as well.
data.loc[:2, 2] = np.nan
print(data)
#           0         1         2
# 0 -0.429629       NaN       NaN
# 1  0.071094       NaN       NaN
# 2  0.735083       NaN       NaN
# 3 -2.396363       NaN  0.236465
# 4 -2.817603       NaN -0.919750
# 5 -1.031900  0.941620  1.547814
# 6  1.290588  0.116789  0.375252

# thresh=3 keeps only the rows holding at least three non-NA values, which
# here leaves just rows 5 and 6.
print(data.dropna(thresh=3))
#           0         1         2
# 5  0.559282  0.406619 -0.633786
# 6  0.061291 -0.586182  0.123497

# --- Filling in missing data ---

# Replace every NA with a single scalar.
print(data.fillna(0))
#           0         1         2
# 0 -0.621390  0.000000  0.000000
# 1 -1.483077  0.000000  0.000000
# 2 -0.948426  0.000000  0.000000
# 3  1.641440  0.000000  2.015218
# 4 -1.036951  0.000000  0.990668
# 5 -0.396387 -0.043747 -0.579406
# 6 -0.272858 -1.523178 -0.494554

# Per-column fill values: column 1 gets 0.5, column 2 gets -1.
print(data.fillna({1: 0.5, 2: -1}))
#           0         1         2
# 0 -0.857152  0.500000 -1.000000
# 1 -1.054372  0.500000 -1.000000
# 2  0.930643  0.500000 -1.000000
# 3 -1.130063  0.500000  0.240622
# 4 -0.623378  0.500000  0.524232
# 5  0.662496 -0.101754  0.170956
# 6  0.213570  0.864930 -0.383311

# inplace=True modifies the existing object instead of returning a new one
# (the call itself returns None, so there is nothing to bind).
data.fillna(0, inplace=True)
print(data)
#           0         1         2
# 0 -1.440240  0.000000  0.000000
# 1  0.100231  0.000000  0.000000
# 2 -0.660957  0.000000  0.000000
# 3  0.004898  0.000000 -1.313950
# 4  1.110324  0.000000 -0.276177
# 5  2.536283 -0.294194 -0.706136
# 6 -2.313634 -0.270051  0.295415
# Demo 4: forward-filling gaps, the same interpolation idea that reindexing
# offers.
#
# NOTE: the sample outputs below were captured from separate runs, so the
# random values differ from one listing to the next.
df = DataFrame(np.random.randn(6, 3))
# .loc replaces the removed .ix indexer; label slices are inclusive.
df.loc[2:, 1] = np.nan  # rows 2-5 of column 1
df.loc[4:, 2] = np.nan  # rows 4-5 of column 2
print(df)
#           0         1         2
# 0 -1.292891  0.977053 -1.339258
# 1 -0.981534  0.643460 -0.699660
# 2 -0.343731       NaN  0.812251
# 3  0.446141       NaN -0.824229
# 4  0.389609       NaN       NaN
# 5  0.716714       NaN       NaN

# ffill() carries the last valid value of each column downwards; it is the
# dedicated replacement for the deprecated fillna(method='ffill').
forward_filled = df.ffill()
print(forward_filled)
#           0         1         2
# 0 -0.672379  0.088150 -0.765589
# 1  0.225561  1.370398 -1.211027
# 2  0.379040  1.370398 -1.009322
# 3 -0.388188  1.370398 -0.986014
# 4  0.387574  1.370398 -0.986014
# 5 -0.196254  1.370398 -0.986014

# limit=2 fills at most two consecutive NA values per gap, so the tail of
# column 1 stays NA.
limited_fill = df.ffill(limit=2)
print(limited_fill)
#           0         1         2
# 0  0.816393  0.031747  1.354395
# 1 -0.940994  0.093215  0.837312
# 2 -0.163731  0.093215  0.349830
# 3  0.268226  0.093215 -0.762212
# 4 -2.650622       NaN -0.762212
# 5 -1.195725       NaN -0.762212
# Demo 5: replace the NA values with the mean of the values that are present.
data = Series([1, np.nan, 2, np.nan, 3])
# mean() skips NA by default, so the fill value here is (1 + 2 + 3) / 3 = 2.0.
print(data.fillna(data.mean()))
# 0    1.0
# 1    2.0
# 2    2.0
# 3    2.0
# 4    3.0
# dtype: float64

# Summary of the fillna() parameters:
#   value   - scalar (or dict-like, per column) used to replace missing
#             entries
#   method  - fill strategy such as 'ffill' (carry the last valid value
#             forward); it is only applied when given explicitly, and newer
#             pandas releases deprecate it in favour of ffill() / bfill()
#   axis    - axis to fill along; by default values are filled down each
#             column
#   inplace - modify the calling object instead of producing a copy
#   limit   - cap on how many missing entries get filled