天天看點

python資料分析第二版阮敬ppt_利用Python進行資料分析第二版複現(十三)_2

import pandas as pd

# Cap how many rows pandas prints for a DataFrame so the previews stay short.
pd.options.display.max_rows = 10

# Each .dat file is read with header=None (no header row in the file) and
# fields separated by '::'. A separator longer than one character is
# interpreted as a regex, which only the 'python' parser engine supports.
# Naming the engine explicitly avoids the ParserWarning pandas emits when it
# has to fall back from the default 'c' engine.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

E:\anaconda\lib\site-packages\ipykernel_launcher.py:7: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

import sys

E:\anaconda\lib\site-packages\ipykernel_launcher.py:11: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

# This is added back by InteractiveShellApp.init_path()

E:\anaconda\lib\site-packages\ipykernel_launcher.py:14: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

print(users[:5])

user_id gender age occupation zip

0 1 F 1 10 48067

1 2 M 56 16 70072

2 3 M 25 15 55117

3 4 M 45 7 02460

4 5 M 25 20 55455

# Preview the first five rows of the ratings table.
print(ratings[:5])

user_id movie_id rating timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

print(movies[:5])

movie_id title genres

0 1 Toy Story (1995) Animation|Children's|Comedy

1 2 Jumanji (1995) Adventure|Children's|Fantasy

2 3 Grumpier Old Men (1995) Comedy|Romance

3 4 Waiting to Exhale (1995) Comedy|Drama

4 5 Father of the Bride Part II (1995) Comedy

print(ratings)

user_id movie_id rating timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

... ... ... ... ...

1000204 6040 1091 1 956716541

1000205 6040 1094 5 956704887

1000206 6040 562 5 956704746

1000207 6040 1096 4 956715648

1000208 6040 1097 4 956715569

[1000209 rows x 4 columns]

# Chain two merges to combine the three tables into one. With no `on`
# argument, pandas joins on the column names the two frames have in common.
data = ratings.merge(users).merge(movies)

print(data)

user_id movie_id rating timestamp gender age occupation zip \

0 1 1193 5 978300760 F 1 10 48067

1 2 1193 5 978298413 M 56 16 70072

2 12 1193 4 978220179 M 25 12 32793

3 15 1193 4 978199279 M 25 7 22903

4 17 1193 5 978158471 M 50 1 95350

... ... ... ... ... ... ... ... ...

1000204 5949 2198 5 958846401 M 18 17 47901

1000205 5675 2703 3 976029116 M 35 14 30030

1000206 5780 2845 1 958153068 M 18 17 92886

1000207 5851 3607 5 957756608 F 18 20 55410

1000208 5938 2909 4 957273353 M 25 1 35401

title genres

0 One Flew Over the Cuckoo's Nest (1975) Drama

1 One Flew Over the Cuckoo's Nest (1975) Drama

2 One Flew Over the Cuckoo's Nest (1975) Drama

3 One Flew Over the Cuckoo's Nest (1975) Drama

4 One Flew Over the Cuckoo's Nest (1975) Drama

... ... ...

1000204 Modulations (1998) Documentary

1000205 Broken Vessels (1998) Drama

1000206 White Boys (1999) Drama

1000207 One Little Indian (1973) Comedy|Drama|Western

1000208 Five Wives, Three Secretaries and Me (1998) Documentary

[1000209 rows x 10 columns]

data.iloc[0]

user_id 1

movie_id 1193

rating 5

timestamp 978300760

gender F

age 1

occupation 10

zip 48067

title One Flew Over the Cuckoo's Nest (1975)

genres Drama

Name: 0, dtype: object

# pivot_table takes parameters that control which statistic is computed:
# here, the mean rating for each title, with one column per gender.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

print(mean_ratings[:5])

gender F M

title

$1,000,000 Duck (1971) 3.375000 2.761905

'Night Mother (1986) 3.388889 3.352941

'Til There Was You (1997) 2.675676 2.733333

'burbs, The (1989) 2.793478 2.962085

...And Justice for All (1979) 3.828571 3.689024

# As in the book's example, drop titles that received fewer than 250 ratings.

# Group by title and count the rows (ratings) in each group.
ratings_by_title = data.groupby('title').size()

# Keep only the titles whose count reaches the 250-rating threshold.
active_titles = ratings_by_title[ratings_by_title >= 250].index

# Select that subset of rows from the per-title mean ratings computed earlier.
mean_ratings = mean_ratings.loc[active_titles]

# Sort by the F (female) column in descending order.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)

print(top_female_ratings[:10])

gender F M

title

Close Shave, A (1995) 4.644444 4.473795

Wrong Trousers, The (1993) 4.588235 4.478261

Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589

Wallace & Gromit: The Best of Aardman Animation... 4.563107 4.385075

Schindler's List (1993) 4.562602 4.491415

Shawshank Redemption, The (1994) 4.539075 4.560625

Grand Day Out, A (1992) 4.537879 4.293255

To Kill a Mockingbird (1962) 4.536667 4.372611

Creature Comforts (1990) 4.513889 4.272277

Usual Suspects, The (1995) 4.513317 4.518248