数据分析 (电影数据)
import pandas as pd

# Locations of the three input files.
fuser = '//home//yunpiao//data/1M//users.dat'
fmovie = '/home/yunpiao/data/1M/movies.dat'
fratings = '/home/yunpiao/data/1M/ratings.dat'

# Every file is parsed the same way: no header row, fields separated by the
# two-character '::' token, and the Python parsing engine requested explicitly.

# Users table.
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
pusers = pd.read_table(fuser, sep='::', header=None, names=uname,
                       engine='python')

# Ratings table.
uname = ['user_id', 'movie_id', 'rating', 'timestamp']
prating = pd.read_table(fratings, sep='::', header=None, names=uname,
                        engine='python')

# Movies table.
# The original timed this load with the IPython magic
#     %timeit pmovie = pd.read_table(...)
# which is not valid Python outside IPython and does not leave `pmovie`
# bound afterwards. The load is therefore a plain assignment here; to
# reproduce the timing, run `%timeit pd.read_table(...)` separately.
uname = ['movie_id', 'title', 'genres']
pmovie = pd.read_table(fmovie, sep='::', header=None, names=uname,
                       engine='python')
100 loops, best of 3: 11.5 ms per loop
切片
# Display the first five rows of the users table.
pusers.head(5)
| user_id | gender | age | occupation | zip |
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
# Display the first five rows of the ratings table.
prating.head(5)
| user_id | movie_id | rating | timestamp |
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
# Display every fourth row of the movies table, from position 1 up to (not
# including) position 10.
pmovie.iloc[1:10:4]
| movie_id | title | genres |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
# Combine ratings with user attributes, then with movie attributes. No join
# keys are given, so pd.merge joins on the column names the frames share.
data = pd.merge(pd.merge(prating, pusers), pmovie)

# NOTE(review): the statement that displayed a sample row was lost from this
# line; the recorded output is labelled "Name: 6", so presumably row 6 was
# shown - confirm against the original notebook.
data.loc[6]
user_id                                           19
movie_id                                        1193
rating                                             5
timestamp                                  982730936
gender                                             M
age                                                1
occupation                                        10
zip                                            48073
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 6, dtype: object
# Mean rating per title, with one column per gender value.
mean_ratings = data.pivot_table(
    values='rating', index='title', columns='gender', aggfunc='mean'
)
mean_ratings[:5]
gender | F | M |
title | | |
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
# Number of rating rows recorded for each title.
rating_by_title = data.groupby('title').size()
rating_by_title[:4]
title
$1,000,000 Duck (1971)        37
'Night Mother (1986)          70
'Til There Was You (1997)     52
'burbs, The (1989)           303
dtype: int64
# Titles with at least 250 ratings; used to restrict the later statistics.
active_title = rating_by_title.index[rating_by_title >= 250]
print(active_title)
Index([u"'burbs, The (1989)", u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)', ... u'X-Men (2000)', u'Year of Living Dangerously (1982)', u'Yellow Submarine (1968)', u"You've Got Mail (1998)", u'Young Frankenstein (1974)', u'Young Guns (1988)', u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)', u'Zero Effect (1998)', u'eXistenZ (1999)'], dtype='object', name=u'title', length=1216)
# Keep only the rows for the titles in `active_title`.
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based
# row selection.
mean_ratings = mean_ratings.loc[active_title]
mean_ratings[:3]
gender | F | M |
title | | |
'burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
# Titles ordered by the 'M' column, highest mean first.
# NOTE(review): "demale" looks like a typo for "male"; the name is kept so
# any later references keep working.
top_demale_ratings = mean_ratings.sort_values(by='M', ascending=False)
top_demale_ratings['M'][:3]
title
Godfather, The (1972)                                                  4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)    4.576628
Shawshank Redemption, The (1994)                                       4.560625
Name: M, dtype: float64
# Per-title gap between the two columns: positive values mean the 'M' mean
# is higher than the 'F' mean.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
mean_ratings[:5]
gender | F | M | diff |
title | | | |
'burbs, The (1989) | 2.793478 | 2.962085 | 0.168607 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 | -0.334586 |
101 Dalmatians (1961) | 3.791444 | 3.500000 | -0.291444 |
101 Dalmatians (1996) | 3.240000 | 2.911215 | -0.328785 |
12 Angry Men (1957) | 4.184397 | 4.328421 | 0.144024 |
# Titles ordered by the 'diff' column, largest value first.
top_diff = mean_ratings.sort_values(by="diff", ascending=False)
top_diff[:4]
gender | F | M | diff |
title | | | |
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
# Standard deviation of the ratings for each title.
rating_std_by_title = data.groupby('title')['rating'].std()
# Restrict to the titles in `active_title`. `.ix` was removed in pandas 1.0;
# `.loc` performs the same label-based selection.
rating_std_by_title = rating_std_by_title.loc[active_title]
# The ten titles with the largest standard deviation.
rating_std_by_title.sort_values(ascending=False)[:10]
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64
111