首页 > 代码库 > 111

111

数据分析 (电影数据)

import pandas as pd

# Locations of the three input files. The users path is reproduced exactly as
# published, doubled slashes included.
fuser = '//home//yunpiao//data/1M//users.dat'
fmovie = '/home/yunpiao/data/1M/movies.dat'
fratings = '/home/yunpiao/data/1M/ratings.dat'

# The files have no header row, so column names are supplied explicitly.
# The two-character '::' separator requires the Python parsing engine.
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
pusers = pd.read_table(fuser, sep='::', header=None, names=uname, engine='python')

uname = ['user_id', 'movie_id', 'rating', 'timestamp']
prating = pd.read_table(fratings, sep='::', header=None, names=uname, engine='python')

uname = ['movie_id', 'title', 'genres']
# Plain assignment rather than `%timeit pmovie = ...`: the magic is not valid
# Python outside IPython, and a name assigned inside the timed statement is not
# left bound afterwards, while `pmovie` is needed by the steps that follow.
# To reproduce the timing, run `%timeit pd.read_table(...)` separately in IPython.
pmovie = pd.read_table(fmovie, sep='::', header=None, names=uname, engine='python')
100 loops, best of 3: 11.5 ms per loop

切片

# Show the first five user rows, selected by position.
pusers.iloc[:5]
user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
# Show the first five rating rows, selected by position.
prating.iloc[:5]
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
# Stepped positional slice: rows at positions 1, 5 and 9.
pmovie.iloc[1:10:4]
movie_id title genres
1 2 Jumanji (1995) Adventure|Children's|Fantasy
5 6 Heat (1995) Action|Crime|Thriller
9 10 GoldenEye (1995) Action|Adventure|Thriller
# Join ratings with user attributes, then with movie attributes, into one table.
# The join keys are written out explicitly; they are the only column names the
# frames have in common (user_id for ratings/users, movie_id for the movies).
data = pd.merge(pd.merge(prating, pusers, on='user_id'), pmovie, on='movie_id')
# NOTE(review): the statement that displayed the single row printed after this
# cell was lost when the page was extracted. Its "Name: 6" footer suggests a
# lookup of row label 6 (e.g. data.loc[6]) -- confirm before restoring it.
user_id                                           19
movie_id                                        1193
rating                                             5
timestamp                                  982730936
gender                                             M
age                                                1
occupation                                        10
zip                                            48073
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 6, dtype: object
# Average rating per movie title (rows), split by viewer gender (columns).
mean_ratings = data.pivot_table(values='rating', index='title',
                                columns='gender', aggfunc='mean')
mean_ratings[:5]
gender F M
title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024
# Number of ratings each title received.
rating_by_title = data.groupby('title').size()
rating_by_title[:4]
title
$1,000,000 Duck (1971)        37
'Night Mother (1986)          70
'Til There Was You (1997)     52
'burbs, The (1989)           303
dtype: int64
# Keep only the titles that were rated at least 250 times.
active_title = rating_by_title.index[rating_by_title >= 250]
print(active_title)
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',
       u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
       u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
       u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
       u'2001: A Space Odyssey (1968)', u'2010 (1984)',
       ...
       u'X-Men (2000)', u'Year of Living Dangerously (1982)',
       u'Yellow Submarine (1968)', u'You've Got Mail (1998)',
       u'Young Frankenstein (1974)', u'Young Guns (1988)',
       u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
       u'Zero Effect (1998)', u'eXistenZ (1999)'],
      dtype='object', name=u'title', length=1216)
# Restrict the per-gender averages to the frequently rated titles.
# .loc performs the label-based row selection; the older .ix indexer is
# deprecated and no longer available in current pandas releases.
mean_ratings = mean_ratings.loc[active_title]
mean_ratings[:3]
gender F M
title
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
# Order the titles by the 'M' column, highest average first, and show the
# top three values of that column.
# NOTE(review): the variable name looks like a typo, but it is kept unchanged
# in case later code refers to it.
top_demale_ratings = mean_ratings.sort_values(by='M', ascending=False)
top_demale_ratings['M'][:3]
title
Godfather, The (1972)                                                  4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)    4.576628
Shawshank Redemption, The (1994)                                       4.560625
Name: M, dtype: float64
# Add a column with the gap between the 'M' and 'F' averages; a positive value
# means the 'M' average is the higher one.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
mean_ratings[:5]
gender F M diff
title
'burbs, The (1989) 2.793478 2.962085 0.168607
10 Things I Hate About You (1999) 3.646552 3.311966 -0.334586
101 Dalmatians (1961) 3.791444 3.500000 -0.291444
101 Dalmatians (1996) 3.240000 2.911215 -0.328785
12 Angry Men (1957) 4.184397 4.328421 0.144024
# Titles ordered by the 'diff' column, largest first; show the first four rows.
top_diff = mean_ratings.sort_values(by="diff", ascending=False)
top_diff[:4:1]
gender F M diff
title
Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
Dumb & Dumber (1994) 2.697987 3.336595 0.638608
Longest Day, The (1962) 3.411765 4.031447 0.619682
# Standard deviation of the ratings of each title.
rating_std_by_title = data.groupby('title')['rating'].std()
# Keep only the frequently rated titles. .loc selects by label and replaces
# the .ix indexer, which is deprecated and gone from current pandas releases.
rating_std_by_title = rating_std_by_title.loc[active_title]
# The ten titles whose ratings vary the most.
rating_std_by_title.sort_values(ascending=False)[:10]
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64

111