天天看點

推薦引擎:基於餘弦相似度的書籍推薦 Python 實作

# -*- coding: utf-8 -*-

# @Date    : 2019-02-14
# @Author  : Peng Shiyu

from copy import deepcopy

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data preparation: one {book title: rating} dict per user.
# Titles that appear in the ratings below: 紅樓夢, 西遊記, 水浒傳, 三國演義

user1 = {"紅樓夢": 4, "西遊記": 3}
user2 = {"紅樓夢": 5, "西遊記": 6, "水浒傳": 3}
user3 = {"紅樓夢": 4, "西遊記": 3, "三國演義": 5}
user4 = {"西遊記": 4, "三國演義": 5}

data = [
    user1,
    user2,
    user3,
    user4,
]

# Feature extraction: turn the rating dicts into a dense user x book matrix
# (one row per entry of `data`, one column per distinct title).
dict_vectorizer = DictVectorizer(dtype=np.int32, sparse=False)
result = dict_vectorizer.fit_transform(data)

# Column labels of `result`. Read the fitted `feature_names_` attribute rather
# than calling `get_feature_names()`: that method was removed in
# scikit-learn 1.2, while the attribute holds the same list on old and new
# releases alike.
books = dict_vectorizer.feature_names_
print(books)
print(result)

# Cosine-similarity matrix: entry [i][j] is the similarity between the rating
# rows of data[i] and data[j].
user_similarity = cosine_similarity(result)
print(user_similarity)

# For every user: pick the two most similar *other* users and score the books
# those neighbours rated that the current user has not rated.
#
# NOTE: the sample output kept in the string after this loop was produced with
# an earlier weighting (position in the neighbour list instead of the
# neighbour's own similarity), so its suggestion scores differ from what this
# loop prints.
for user_id, user_looked in enumerate(data):
    user_suggest = user_similarity[user_id].tolist()

    # Rank the other users by similarity, highest first. Working on user ids
    # (instead of sorting the values and looking them up again with
    # list.index) keeps tied similarities from resolving to the same user
    # twice, and excluding `user_id` explicitly means we never rely on the
    # self-similarity being the largest value in the row.
    ranked_neighbours = sorted(
        (other_id for other_id in range(len(user_suggest)) if other_id != user_id),
        key=user_suggest.__getitem__,
        reverse=True,
    )
    max_index = ranked_neighbours[:2]
    max_similar = [user_suggest[other_id] for other_id in max_index]
    print(max_similar)
    print(max_index)

    suggest = {}
    for neighbour_id in max_index:
        # Weight each rating by this neighbour's similarity to the current user.
        weight = user_suggest[neighbour_id]
        for key, value in data[neighbour_id].items():
            if key not in user_looked:
                # Accumulate so a book rated by both neighbours keeps both
                # contributions instead of the later one replacing the first.
                suggest[key] = suggest.get(key, 0.0) + weight * value

    print(suggest)
# Sample output kept alongside the script: feature names, rating matrix,
# similarity matrix, then for each user the two neighbour similarities and the
# resulting suggestions (the printed neighbour indices are not included).
# This is a bare string expression, so it has no effect when the script runs.
"""
['三國演義', '水浒傳', '紅樓夢', '西遊記']
[[0 0 4 3]
 [0 3 5 6]
 [5 0 4 3]
 [5 0 0 4]]
 
[[1.         0.90837374 0.70710678 0.37481703]
 [0.90837374 1.         0.64231723 0.44799204]
 [0.70710678 0.64231723 1.         0.81719329]
 [0.37481703 0.44799204 0.81719329 1.        ]]
 
[0.9083737430941391, 0.7071067811865475]
{'水浒傳': 3.0, '三國演義': 4.541868715470695}

[0.9083737430941391, 0.6423172335936725]
{'三國演義': 4.999999999999999}

[0.8171932929538644, 0.7071067811865475]
{}

[0.8171932929538644, 0.44799203576793445]
{'紅樓夢': 2.2399601788396724, '水浒傳': 1.3439761073038032}

"""      

參考:

推薦算法和機器學習系列 - 協同過濾推薦算法和餘弦相似性算法