# 天天看點

# Python連續資料離散化處理和pandas.cut函數用法

import string

import numpy as np

import pandas as pd

from pandas import Series, DataFrame

# Demonstrates discretizing (binning) continuous scores with pandas.cut.
#
# NOTE: the sample outputs kept in the comments below were recorded with an
# older pandas release; newer releases may format some details differently
# (for example the interval dtype or the name of the value_counts result).

# Fix the seed so the random scores and student names are reproducible.
np.random.seed(666)

# 20 random integer scores drawn from [25, 100).
score_list = np.random.randint(25, 100, size=20)
print(score_list)
# [27 70 55 87 95 98 55 61 86 76 85 53 39 88 41 71 64 94 38 94]

# Bin edges; with pd.cut's defaults every interval is open on the left and
# closed on the right: (0, 59], (59, 70], (70, 80], (80, 100].
bins = [0, 59, 70, 80, 100]

score_cut = pd.cut(score_list, bins)
print(type(score_cut))  # a pandas Categorical
print(score_cut)
# [(0, 59], (59, 70], (0, 59], (80, 100], (80, 100], ..., (70, 80], (59, 70], (80, 100], (0, 59], (80, 100]]
# Length: 20
# Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]

# Count how many scores fall into each interval.  The top-level
# pd.value_counts helper is deprecated, so go through Series.value_counts,
# which sorts by descending count like the recorded output.
score_counts = Series(score_cut).value_counts()
print(score_counts)
# (80, 100]    8
# (0, 59]      7
# (59, 70]     3
# (70, 80]     2
# dtype: int64

# Random 3-character student names built from ASCII letters and digits.
# pandas.util.testing.rands no longer exists in current pandas, so draw the
# characters directly with np.random.choice from the seeded global state.
# NOTE(review): this is believed to mirror the removed helper and therefore
# reproduce the names recorded below -- confirm on your numpy version.
name_chars = np.array(list(string.ascii_letters + string.digits))

df = DataFrame()
df['score'] = score_list
df['student'] = [
    ''.join(np.random.choice(name_chars, 3)) for _ in range(len(score_list))
]
print(df)
#     score student
# 0      27     1ul
# 1      70     yuK
# 2      55     WWK
# 3      87     EU6
# 4      95     Vqn
# 5      98     KAf
# 6      55     QNT
# 7      61     HaE
# 8      86     aBo
# 9      76     MMa
# 10     85     Ctc
# 11     53     5BI
# 12     39     wBp
# 13     88     WMB
# 14     41     q5t
# 15     71     MjZ
# 16     64     nTc
# 17     94     Kyx
# 18     38     Rlh
# 19     94     2uV

# Bin the 'score' column with cut; computed once and reused below.
score_intervals = pd.cut(df['score'], bins)
print(score_intervals)
# 0       (0, 59]
# 1      (59, 70]
# 2       (0, 59]
# 3     (80, 100]
# 4     (80, 100]
# 5     (80, 100]
# 6       (0, 59]
# 7      (59, 70]
# 8     (80, 100]
# 9      (70, 80]
# 10    (80, 100]
# 11      (0, 59]
# 12      (0, 59]
# 13    (80, 100]
# 14      (0, 59]
# 15     (70, 80]
# 16     (59, 70]
# 17    (80, 100]
# 18      (0, 59]
# 19    (80, 100]
# Name: score, dtype: category
# Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]

df['Categories'] = score_intervals
print(df)
#     score student Categories
# 0      27     1ul    (0, 59]
# 1      70     yuK   (59, 70]
# 2      55     WWK    (0, 59]
# 3      87     EU6  (80, 100]
# 4      95     Vqn  (80, 100]
# 5      98     KAf  (80, 100]
# 6      55     QNT    (0, 59]
# 7      61     HaE   (59, 70]
# 8      86     aBo  (80, 100]
# 9      76     MMa   (70, 80]
# 10     85     Ctc  (80, 100]
# 11     53     5BI    (0, 59]
# 12     39     wBp    (0, 59]
# 13     88     WMB  (80, 100]
# 14     41     q5t    (0, 59]
# 15     71     MjZ   (70, 80]
# 16     64     nTc   (59, 70]
# 17     94     Kyx  (80, 100]
# 18     38     Rlh    (0, 59]
# 19     94     2uV  (80, 100]

# Raw intervals are not very readable; cut's `labels` parameter assigns a
# human-friendly label to each interval instead (one label per bin, in order).
df['Categories'] = pd.cut(
    df['score'], bins, labels=['low', 'middle', 'good', 'perfect']
)
print(df)
#     score student Categories
# 0      27     1ul        low
# 1      70     yuK     middle
# 2      55     WWK        low
# 3      87     EU6    perfect
# 4      95     Vqn    perfect
# 5      98     KAf    perfect
# 6      55     QNT        low
# 7      61     HaE     middle
# 8      86     aBo    perfect
# 9      76     MMa       good
# 10     85     Ctc    perfect
# 11     53     5BI        low
# 12     39     wBp        low
# 13     88     WMB    perfect
# 14     41     q5t        low
# 15     71     MjZ       good
# 16     64     nTc     middle
# 17     94     Kyx    perfect
# 18     38     Rlh        low
# 19     94     2uV    perfect