基於使用者的協同過濾算法在使用者數量增長時,使用者之間相似度的計算會越來越困難。基於物品的算法則給使用者推薦與他們之前喜歡的物品相似的物品。

算法步驟:
- 計算物品之間的相似度
- 根據物品的相似度和使用者的歷史行為給使用者生成推薦清單
相似度公式如下:
$$w_{ij} = \frac{|N(i) \cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}$$
該公式減輕了熱門物品和很多物品相似的可能性。在計算相似度的時候,建立倒排表。相似度的計算與基于使用者的相同,不再贅述。
計算相似度之後,u對物品j的興趣如下:
$$p_{uj} = \sum_{i \in N(u) \cap S(j,K)} w_{ji}\, r_{ui}$$
其中$N(u)$是使用者u喜歡的物品的集合,$S(j,K)$是和物品j最相似的K個物品的集合,$w_{ji}$是物品j和i的相似度,$r_{ui}$是使用者u對物品i的興趣(下面的代碼中取使用者對該物品的評分)。
使用者活躍度對物品相似度的影響
考慮到並不是每個使用者對推薦的貢獻度都是一樣的,例如某書店店主買了當當上80%的書作為存貨,這80%的書兩兩之間都會產生關聯,但這種關聯並不是由使用者的興趣愛好產生的。我們要降低這類使用者的貢獻度,因此引入IUF(Inverse User Frequency),即使用者活躍度對數的倒數,並利用IUF修正物品相似度的計算:
$$w_{ij} = \frac{\sum_{u \in N(i) \cap N(j)} \frac{1}{\log(1 + |N(u)|)}}{\sqrt{|N(i)|\,|N(j)|}}$$
物品相似度歸一化
Karypis在研究中發現,如果將相似度矩陣按照最大值歸一化,可以提高推薦的準確率。即
$$w'_{ij} = \frac{w_{ij}}{\max_{j} w_{ij}}$$
相似度的歸一化可以提高推薦的多樣性和覆蓋率。
完整代碼如下
import random
import sys
import math
import os
from operator import itemgetter
# Seed the module-level RNG with no explicit value, so the random train/test
# split in ItemBasedCF.generate_dataset is not pinned to a fixed seed and will
# generally differ between runs. Pass a constant here for reproducible splits.
random.seed()
class ItemBasedCF(object):
    """Top-N recommender based on item-to-item collaborative filtering.

    Workflow: ``generate_dataset`` splits a ratings file into train/test
    sets, ``calc_movie_sim`` builds the item similarity matrix from the
    train set, ``recommend`` produces a ranked list for one user and
    ``evaluate`` reports precision / recall / coverage / popularity.

    Attributes:
        trainset: ``{user: {movie: rating}}`` used to fit similarities.
        testset: ``{user: {movie: rating}}`` held out for evaluation.
        n_sim_movie: K, number of most-similar movies consulted per
            watched movie.
        n_rec_movie: N, number of movies recommended per user.
        movie_sim_mat: ``{movie: {other_movie: similarity}}``.
        movie_popular: ``{movie: number of train users who rated it}``.
        movie_count: number of distinct movies in the train set.
    """

    def __init__(self):
        self.trainset = {}
        self.testset = {}

        # NOTE(review): the numeric literals were missing from the listing;
        # K=20 / N=10 are the conventional defaults for this algorithm --
        # confirm against the intended experiment setup.
        self.n_sim_movie = 20
        self.n_rec_movie = 10

        self.movie_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommendend movie number = %d' % self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        """Yield the lines of *filename* with trailing CR/LF removed.

        Progress is reported on stderr every 100000 lines. The file is
        opened with a context manager so it is closed even if the caller
        stops iterating early or an error occurs.
        """
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\r\n')
                if i % 100000 == 0:
                    print('load %s(%s)' % (filename, i), file=sys.stderr)
        print('load %s succ' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        """Load ratings from *filename* and randomly split them.

        Each line must look like ``user::movie::rating::timestamp``. A
        rating goes to the train set with probability *pivot*, otherwise
        to the test set. Blank lines are skipped.

        Args:
            filename: path of the ratings file.
            pivot: fraction (0..1) of ratings expected in the train set.

        Raises:
            ValueError: if a non-blank line does not have four ``::``
                separated fields or the rating is not an integer.
        """
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            if not line:
                # Tolerate blank / trailing empty lines in the data file.
                continue
            user, movie, rating, _ = line.split('::')
            if random.random() < pivot:
                self.trainset.setdefault(user, {})
                self.trainset[user][movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})
                self.testset[user][movie] = int(rating)
                testset_len += 1

        print('split succ , trainset is %d , testset is %d'
              % (trainset_len, testset_len), file=sys.stderr)

    def calc_movie_sim(self):
        """Build ``movie_sim_mat`` from the train set.

        Similarity of movies i and j is
        ``|N(i) & N(j)| / sqrt(|N(i)| * |N(j)|)`` where ``N(x)`` is the
        set of train users who rated x. Also fills ``movie_popular`` and
        ``movie_count``.
        """
        # Pass 1: popularity = number of users who rated each movie.
        for user, movies in self.trainset.items():
            for movie in movies:
                if movie not in self.movie_popular:
                    self.movie_popular[movie] = 0
                self.movie_popular[movie] += 1
        print('count movies number and pipularity succ', file=sys.stderr)

        self.movie_count = len(self.movie_popular)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        # Pass 2: co-occurrence counts, accumulated in place in the
        # similarity matrix (alias below) to avoid a second big dict.
        itemsim_mat = self.movie_sim_mat
        print('building co-rated users matrix', file=sys.stderr)
        for user, movies in self.trainset.items():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2:
                        continue
                    itemsim_mat.setdefault(m1, {})
                    itemsim_mat[m1].setdefault(m2, 0)
                    itemsim_mat[m1][m2] += 1
        print('build co-rated users matrix succ', file=sys.stderr)

        # Pass 3: turn each co-occurrence count into a similarity.
        # Only existing keys are reassigned, so mutating while iterating
        # is safe here.
        print('calculating movie similarity matrix', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000
        for m1, related_movies in itemsim_mat.items():
            for m2, count in related_movies.items():
                itemsim_mat[m1][m2] = count / math.sqrt(
                    self.movie_popular[m1] * self.movie_popular[m2])
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print('calcu movie similarity factor(%d)'
                          % simfactor_count, file=sys.stderr)
        print('calcu similiarity succ', file=sys.stderr)

    def recommend(self, user):
        """Return up to N ``(movie, score)`` pairs for *user*, best first.

        For every movie the user rated in the train set, its K most
        similar movies each receive ``similarity * rating``; movies the
        user already rated are excluded.

        Raises:
            KeyError: if *user* is not in the train set.
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainset[user]

        for movie, rating in watched_movies.items():
            # A movie never co-rated with any other movie has no row in the
            # similarity matrix; treat it as having no neighbours.
            neighbours = self.movie_sim_mat.get(movie, {})
            for related_movie, similarity_factor in sorted(
                    neighbours.items(), key=itemgetter(1), reverse=True)[:K]:
                if related_movie in watched_movies:
                    continue
                rank.setdefault(related_movie, 0)
                rank[related_movie] += similarity_factor * rating

        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """Print precision, recall, coverage and popularity to stderr.

        Recommendations are generated for every train-set user and
        compared with that user's test-set movies. Popularity is the mean
        of ``log(1 + popularity)`` over recommended movies. Any metric
        whose denominator is zero is reported as 0.0 instead of raising.
        """
        print('evaluation start', file=sys.stderr)

        N = self.n_rec_movie
        hit = 0
        rec_count = 0
        test_count = 0
        all_rec_movies = set()
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print('recommend for %d users ' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            # N is counted per user even when fewer than N movies could be
            # recommended, matching the original metric definition.
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = (len(all_rec_movies) / (1.0 * self.movie_count)
                    if self.movie_count else 0.0)
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0

        print('precision is %.4f\t recall is %.4f \t coverage is %.4f \t popularity is %.4f'
              % (precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
    # Script entry point: split the ratings, fit item similarities, then
    # print the evaluation metrics to stderr.
    # NOTE(review): the path and the '::' line format suggest the MovieLens
    # 1M ratings file is expected under ./ml-1m -- confirm.
    ratingfile = os.path.join('ml-1m', 'ratings.dat')
    itemcf = ItemBasedCF()
    itemcf.generate_dataset(ratingfile)
    itemcf.calc_movie_sim()
    itemcf.evaluate()