天天看点

python数据分析第二版阮敬ppt_利用Python进行数据分析第二版复现(十三)_2

import pandas as pd

pd.options.display.max_rows = 10

# Load the three MovieLens 1M tables (users, ratings, movies).
#
# The files use '::' as the field delimiter. A separator longer than one
# character is interpreted by pandas as a regular expression, which the
# default C parser does not support; without engine='python' every call
# falls back to the Python parser and emits a ParserWarning. Requesting the
# Python engine explicitly gives the same result without the warning.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

E:\anaconda\lib\site-packages\ipykernel_launcher.py:7: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

import sys

E:\anaconda\lib\site-packages\ipykernel_launcher.py:11: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

# This is added back by InteractiveShellApp.init_path()

E:\anaconda\lib\site-packages\ipykernel_launcher.py:14: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

print(users[:5])

user_id gender age occupation zip

0 1 F 1 10 48067

1 2 M 56 16 70072

2 3 M 25 15 55117

3 4 M 45 7 02460

4 5 M 25 20 55455

# Show the first five rating records.
print(ratings.head(5))

user_id movie_id rating timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

print(movies[:5])

movie_id title genres

0 1 Toy Story (1995) Animation|Children's|Comedy

1 2 Jumanji (1995) Adventure|Children's|Fantasy

2 3 Grumpier Old Men (1995) Comedy|Romance

3 4 Waiting to Exhale (1995) Comedy|Drama

4 5 Father of the Bride Part II (1995) Comedy

print(ratings)

user_id movie_id rating timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

... ... ... ... ...

1000204 6040 1091 1 956716541

1000205 6040 1094 5 956704887

1000206 6040 562 5 956704746

1000207 6040 1096 4 956715648

1000208 6040 1097 4 956715569

[1000209 rows x 4 columns]

# pandas' merge can combine several tables into one: ratings are joined
# with users first, and that result is then joined with movies. No join key
# is passed, so pandas joins on the column names the two frames share.
data = ratings.merge(users).merge(movies)

print(data)

user_id movie_id rating timestamp gender age occupation zip \

0 1 1193 5 978300760 F 1 10 48067

1 2 1193 5 978298413 M 56 16 70072

2 12 1193 4 978220179 M 25 12 32793

3 15 1193 4 978199279 M 25 7 22903

4 17 1193 5 978158471 M 50 1 95350

... ... ... ... ... ... ... ... ...

1000204 5949 2198 5 958846401 M 18 17 47901

1000205 5675 2703 3 976029116 M 35 14 30030

1000206 5780 2845 1 958153068 M 18 17 92886

1000207 5851 3607 5 957756608 F 18 20 55410

1000208 5938 2909 4 957273353 M 25 1 35401

title genres

0 One Flew Over the Cuckoo's Nest (1975) Drama

1 One Flew Over the Cuckoo's Nest (1975) Drama

2 One Flew Over the Cuckoo's Nest (1975) Drama

3 One Flew Over the Cuckoo's Nest (1975) Drama

4 One Flew Over the Cuckoo's Nest (1975) Drama

... ... ...

1000204 Modulations (1998) Documentary

1000205 Broken Vessels (1998) Drama

1000206 White Boys (1999) Drama

1000207 One Little Indian (1973) Comedy|Drama|Western

1000208 Five Wives, Three Secretaries and Me (1998) Documentary

[1000209 rows x 10 columns]

data.iloc[0]

user_id 1

movie_id 1193

rating 5

timestamp 978300760

gender F

age 1

occupation 10

zip 48067

title One Flew Over the Cuckoo's Nest (1975)

genres Drama

Name: 0, dtype: object

# pivot_table accepts an aggregation argument, so summary statistics can be
# computed while reshaping: one row per title, one column per gender, each
# cell holding the mean rating.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

print(mean_ratings.head(5))

gender F M

title

$1,000,000 Duck (1971) 3.375000 2.761905

'Night Mother (1986) 3.388889 3.352941

'Til There Was You (1997) 2.675676 2.733333

'burbs, The (1989) 2.793478 2.962085

...And Justice for All (1979) 3.828571 3.689024

# Following the book's example, discard titles with fewer than 250 ratings.

# Group by title and count the rows in each group (ratings per title).
ratings_by_title = data.groupby('title').size()

# Keep only the titles whose rating count reaches the threshold.
active_titles = ratings_by_title[ratings_by_title >= 250].index

# Select that subset of rows from the previously computed mean ratings.
mean_ratings = mean_ratings.loc[active_titles]

# Sort in descending order of the 'F' (female viewers) column.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)

print(top_female_ratings.head(10))

gender F M

title

Close Shave, A (1995) 4.644444 4.473795

Wrong Trousers, The (1993) 4.588235 4.478261

Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589

Wallace & Gromit: The Best of Aardman Animation... 4.563107 4.385075

Schindler's List (1993) 4.562602 4.491415

Shawshank Redemption, The (1994) 4.539075 4.560625

Grand Day Out, A (1992) 4.537879 4.293255

To Kill a Mockingbird (1962) 4.536667 4.372611

Creature Comforts (1990) 4.513889 4.272277

Usual Suspects, The (1995) 4.513317 4.518248