0%

LFM(Latent Factor Model)

召回算法LFM推导及自编程实现。

LFM推导

建模公式:

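$$\hat{r}_{ui} = p_u^{T} q_i = \sum_{f=1}^{F} p_{uf}\, q_{if}$$

其中 $p_u$、$q_i$ 分别为用户 $u$ 和 item $i$ 的 $F$ 维隐向量,$\hat{r}_{ui}$ 为模型预测的用户 $u$ 对 item $i$ 的喜好程度。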

损失函数:

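$$loss = \sum_{(u,i)\in D} \left(r_{ui} - p_u^{T} q_i\right)^2 + \alpha \lVert p_u \rVert^2 + \alpha \lVert q_i \rVert^2$$

其中 $D$ 为训练样本集合,$r_{ui}$ 为样本标签(正样本为1,负样本为0),$\alpha$ 为正则化参数。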

求导:

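$$\frac{\partial loss}{\partial p_{uf}} = -2\left(r_{ui} - p_u^{T} q_i\right) q_{if} + 2\alpha\, p_{uf}$$

$$\frac{\partial loss}{\partial q_{if}} = -2\left(r_{ui} - p_u^{T} q_i\right) p_{uf} + 2\alpha\, q_{if}$$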

算法迭代:

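$$p_{uf} \leftarrow p_{uf} + \beta\left[\left(r_{ui} - p_u^{T} q_i\right) q_{if} - \alpha\, p_{uf}\right]$$

$$q_{if} \leftarrow q_{if} + \beta\left[\left(r_{ui} - p_u^{T} q_i\right) p_{uf} - \alpha\, q_{if}\right]$$

其中 $\beta$ 为学习率,$\alpha$ 为正则化参数(梯度中的常数2并入学习率 $\beta$)。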

LFM应用场景举例

得到user_vec和item_vec后可用于多种场景

  • 计算用户toplike
  • 计算item的topsim
  • 计算item的topic

算法实现

数据集

movies, rating dataset链接

rating包含用户对电影的评分。生成训练样本时,设置评分的阈值为4.0:当评分>=4.0时为正样本,否则为负样本。负采样使正负样本数量相同。负采样时首先计算item的平均评分,然后将负样本集合按照平均评分从高到低排序,选取前n个作为负样本。原因是如果用户对一个平均评分高的item打了低分,说明该用户很大可能不喜欢这个item。(针对其他类型的行为数据,如CTR行为等,可以采用转化率等方式进行负采样)

负样本选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*-coding:utf8-*-
"""
author: fivelike
date: 2021.5.17
util function
"""

import os


def get_item_info(input_file):
    """
    Load item metadata from a comma-separated item file.

    The first line is treated as a header and skipped. Lines with fewer than
    three comma-separated fields are ignored. When a line has more than three
    fields, the title itself is taken to contain commas: the first field is
    the item id, the last field is the genre, and everything in between is
    re-joined with "," to form the title (quote characters are kept as-is).

    :param input_file: item info file
    :return:
        a dict: key: itemid, value: [title, genre];
        an empty dict when input_file does not exist
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    # The context manager closes the file even if reading or decoding fails.
    with open(input_file, encoding="utf-8") as fp:
        for linenum, line in enumerate(fp):
            if linenum == 0:
                # Skip the header row.
                continue
            fields = line.strip().split(',')
            if len(fields) < 3:
                continue
            # With exactly three fields the join below yields fields[1]
            # unchanged; with more, it restores commas inside the title.
            itemid, genre = fields[0], fields[-1]
            title = ",".join(fields[1:-1])
            item_info[itemid] = [title, genre]
    return item_info

def get_ave_score(input_file):
    """
    Compute the average rating score of every item.

    The first line is treated as a header and skipped. Lines with fewer than
    four comma-separated fields are ignored. The second field is used as the
    item id and the third field is parsed as the float rating.

    :param input_file: user rating file
    :return:
        a dict: key: itemid, value: ave_score (rounded to 3 decimals);
        an empty dict when input_file does not exist
    """
    if not os.path.exists(input_file):
        return {}
    # itemid -> [number of ratings, sum of ratings]
    record_dict = {}
    # The context manager closes the file even if parsing a rating fails.
    with open(input_file, encoding="utf-8") as fp:
        for linenum, line in enumerate(fp):
            if linenum == 0:
                # Skip the header row.
                continue
            fields = line.strip().split(',')
            if len(fields) < 4:
                continue
            itemid, rating = fields[1], float(fields[2])
            stats = record_dict.setdefault(itemid, [0, 0])
            stats[0] += 1
            stats[1] += rating
    return {
        itemid: round(total / count, 3)
        for itemid, (count, total) in record_dict.items()
    }

def get_train_data(input_file):
    """
    Build balanced positive/negative training samples for the LFM model.

    A rating >= 4.0 is a positive sample (label 1); anything lower is a
    negative candidate. For every user the same number of positives and
    negatives is kept: min(#positives, #negative candidates). Users for whom
    that number is zero contribute no samples.

    :param input_file: user_item_rating file
    :return:
        a list: [(userid, itemid, label)];
        an empty list when input_file does not exist
    """
    if not os.path.exists(input_file):
        return []
    score_dict = get_ave_score(input_file)
    score_thr = 4.0
    pos_dict = {}  # userid -> [(itemid, 1), ...]
    neg_dict = {}  # userid -> [(itemid, item average score), ...]
    # The context manager closes the file even if parsing a rating fails.
    with open(input_file, encoding="utf-8") as fp:
        for linenum, line in enumerate(fp):
            if linenum == 0:
                # Skip the header row.
                continue
            fields = line.strip().split(',')
            if len(fields) < 4:
                continue
            userid, itemid, rating = fields[0], fields[1], float(fields[2])
            # Every user seen gets an entry in both dicts.
            pos_dict.setdefault(userid, [])
            neg_dict.setdefault(userid, [])
            if rating >= score_thr:
                pos_dict[userid].append((itemid, 1))
            else:
                neg_dict[userid].append((itemid, score_dict.get(itemid, 0)))

    train_data = []
    for userid, pos_list in pos_dict.items():
        neg_list = neg_dict.get(userid, [])
        data_num = min(len(pos_list), len(neg_list))
        if data_num <= 0:
            continue
        train_data += [(userid, itemid, label) for itemid, label in pos_list[:data_num]]
        # Rank negative candidates by the item's average score, highest first:
        # a low rating on an item that is well rated on average suggests the
        # user really is not interested in it.
        sorted_neg_list = sorted(neg_list, key=lambda element: element[1], reverse=True)[:data_num]
        train_data += [(userid, itemid, 0) for itemid, _ in sorted_neg_list]
    return train_data

LFM model train

LFM模型参数包含隐向量维度F,正则化参数,学习率,迭代次数。训练过程如下:

  • 对每一个用户和item使用标准正态分布初始化一个F维的向量:np.random.randn(F)
  • 根据迭代公式对user和item向量每个维度进行更新
  • 每次迭代后,学习率衰减
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*-coding:utf8-*-
"""
author: fivelike
date: 2021.5.17
lfm(latent factor model) model train main function
"""
from tqdm import tqdm
import numpy as np
import sys

sys.path.append("../util")
import util.read as read
import operator


def lfm_train(train_data, F, alpha, beta, step):
    """
    Train the LFM model with per-sample gradient updates.

    Vectors are created lazily with init_model(F) the first time a userid or
    itemid is seen. After every full pass over train_data the learning rate
    is multiplied by 0.9.

    :param train_data: train_data for lfm, an iterable of (userid, itemid, label)
    :param F: user_vec_len, item_vec_len
    :param alpha: regularization factor
    :param beta: learning rate
    :param step: iteration num
    :return:
        dict: key userid, value: ndarray
        dict: key itemid, value: ndarray
    """
    user_vec = {}
    item_vec = {}
    for _ in tqdm(range(step)):
        for userid, itemid, label in train_data:
            if userid not in user_vec:
                user_vec[userid] = init_model(F)
            if itemid not in item_vec:
                item_vec[itemid] = init_model(F)
            p_u = user_vec[userid]
            q_i = item_vec[itemid]

            delta = label - model_predict(p_u, q_i)
            # Whole-vector in-place updates replace the per-component loop.
            # The user vector is updated first and the item update then reads
            # the refreshed user vector, which keeps the per-component
            # arithmetic of the original element-by-element loop.
            p_u += beta * (delta * q_i - alpha * p_u)
            q_i += beta * (delta * p_u - alpha * q_i)

        # Decay the learning rate after each pass over the training data.
        beta = beta * 0.9
    return user_vec, item_vec


def init_model(vector_len):
    """
    Create a freshly initialised latent vector.

    Each component is drawn from the standard normal distribution using
    NumPy's global random state.

    :param vector_len: the length of vector
    :return:
        a ndarray of shape (vector_len,)
    """
    latent_vector = np.random.standard_normal(size=(vector_len,))
    return latent_vector


def model_predict(user_vector, item_vector):
    """
    Cosine similarity between a user vector and an item vector.

    :param user_vector: model produce user vector
    :param item_vector: model produce item vector
    :return:
        a num: dot(user, item) / (|user| * |item|); 0.0 when either vector
        has zero norm, so the caller never receives NaN from a 0/0 division
    """
    norm_product = np.linalg.norm(user_vector) * np.linalg.norm(item_vector)
    if norm_product == 0:
        # The cosine is undefined for a zero vector; treat it as "no affinity".
        return 0.0
    return np.dot(user_vector, item_vector) / norm_product


def model_train_process():
    """
    End-to-end smoke run of the LFM pipeline.

    Reads the rating samples, trains the model, then prints the recommendation
    analysis for one fixed user.

    :return:
    """
    target_user = '24'
    samples = read.get_train_data("../data/ratings.csv")
    # Positional hyper-parameters: F=50, alpha=0.01, beta=0.1, step=50.
    user_vec, item_vec = lfm_train(samples, 50, 0.01, 0.1, 50)
    recommendations = give_recom_result(user_vec, item_vec, target_user)
    ana_recom_result(samples, target_user, recommendations)


def give_recom_result(user_vec, item_vec, userid, fix_num=10):
    """
    Rank all items for one user by cosine similarity and keep the top ones.

    :param user_vec: lfm model result, dict: key userid, value: ndarray
    :param item_vec: lfm model result, dict: key itemid, value: ndarray
    :param userid: fix userid
    :param fix_num: maximum number of recommendations to return (default 10)
    :return: a list: [(itemid, score),...] sorted by score descending, with
        scores rounded to 3 decimals; an empty list when userid is unknown
    """
    if userid not in user_vec:
        return []
    user_vector = user_vec[userid]
    # The user norm does not depend on the item, so compute it once.
    user_norm = np.linalg.norm(user_vector)
    record = {}
    for itemid, item_vector in item_vec.items():
        norm_product = user_norm * np.linalg.norm(item_vector)
        if norm_product == 0:
            # The cosine is undefined for a zero vector; score it as 0.0 so
            # NaN never reaches the sort below.
            record[itemid] = 0.0
        else:
            record[itemid] = np.dot(user_vector, item_vector) / norm_product
    ranked = sorted(record.items(), key=operator.itemgetter(1), reverse=True)
    return [(itemid, round(score, 3)) for itemid, score in ranked[:max(fix_num, 0)]]


def ana_recom_result(train_data, userid, recom_list):
    """
    Debug helper: print what the user liked, then what was recommended.

    First prints the item info of every positive (label == 1) training sample
    of userid, then the line "recom result", then the item info of every
    recommended item. When an item id is missing from the item info dict
    (for example because the movies file could not be read), the raw item id
    is printed instead of raising KeyError.

    :param train_data: train data for lfm model
    :param userid: fix userid
    :param recom_list: recom result by lfm
    """
    item_info = read.get_item_info("../data/movies.csv")
    for tmp_userid, itemid, label in train_data:
        if tmp_userid == userid and label == 1:
            print(item_info.get(itemid, itemid))
    print("recom result")
    for recom in recom_list:
        recom_itemid = recom[0]
        print(item_info.get(recom_itemid, recom_itemid))


# Script entry point: run the demo pipeline only when executed directly.
if __name__ == "__main__":
    model_train_process()