ig IPCompleter.greedy=True %env OPENBLAS_NUM_THREADS=1 import pandas as pd import implicit from sklearn.model_selection import train_test_split import scipy.sparse as sps import numpy as np import matplotlib.pyplot as plt import tqdm from collections import defaultdict ! ls -lh yandex_music ! head -n 5 yandex_music/artists.json
%config IPCompleter.greedy=True
%env OPENBLAS_NUM_THREADS=1
import pandas as pd
import implicit
from sklearn.model_selection import train_test_split
import scipy.sparse as sps
import numpy as np
import matplotlib.pyplot as plt
import tqdm
from collections import defaultdict
! ls -lh yandex_music
! head -n 5 yandex_music/artists.jsonl
! head -n 5 yandex_music/events.csv
# Load the raw data set: artist metadata is read as line-delimited JSON
# records, listening events as a CSV file.
artists = pd.read_json(
    "yandex_music/artists.jsonl",
    lines=True,
    orient="records",
)
events = pd.read_csv("yandex_music/events.csv")
# Most popular artists: join the events with the artist table (pandas merges
# on the columns the two frames share), keep only the name and play count,
# total the plays per artist name and show the ten largest totals.
plays_with_names = events.merge(artists)[['artistName', 'plays']]
plays_per_artist = plays_with_names.groupby("artistName").sum()
plays_per_artist.sort_values('plays', ascending=False).head(10)
# Hold out 5% of the listening events for evaluation; the fixed random_state
# keeps the split reproducible between runs.
train, test = train_test_split(events, test_size=0.05, random_state=0)

# Prepare the item x user matrix for the implicit library
# (https://implicit.readthedocs.io/en/latest/models.html):
#   rows    -> artistId
#   columns -> userId
#   values  -> confidence, computed as 1 + 10 * log2(plays + 1)
# scipy builds the sparse matrix from (data, (row_ind, col_ind)) triplets so
# that a[row_ind[k], col_ind[k]] = data[k].
#
# The shape is derived from the full `events` table instead of being inferred
# from `train`. Without it, an artist or user whose largest id only occurs in
# the held-out rows would fall outside the matrix, and the fitted model would
# have no factor vector to look up for that id during evaluation.
n_artists = int(events.artistId.max()) + 1
n_users = int(events.userId.max()) + 1
confidence = 1 + 10 * np.log2(train.plays + 1)
item_user_train = sps.csc_matrix(
    (confidence, (train.artistId, train.userId)),
    shape=(n_artists, n_users),
)
item_user_train
%%time
# Fit an implicit-feedback ALS model on the training matrix:
# 32 latent factors, 10 ALS iterations, fixed seed for reproducibility.
model = implicit.als.AlternatingLeastSquares(
    random_state=0,
    iterations=10,
    factors=32,
)
model.fit(item_user_train)
# Sanity check of the learned item factors: for a few well-known artists,
# print the top-5 entries returned by model.similar_items with their scores.
artist_to_name = dict(zip(artists.artistId, artists.artistName))
target_artists = artists[artists.artistName.isin(['Coldplay', '50 Cent', 'AC/DC'])]
for artist_id, artist_name in zip(target_artists.artistId, target_artists.artistName):
    print("#############", artist_name, "#############")
    neighbours = model.similar_items(artist_id, N=5)
    # Each entry is unpacked as an (artist id, similarity score) pair.
    for neighbour_id, score in neighbours:
        print(artist_to_name[neighbour_id], "\t", score)
from sklearn.metrics import ndcg_score
## NDCG for iALS
Q2: Now let's replace global popularity with the iALS prediction for each user.
Make sure to compute NDCG only for users with at least 2 different artists in the test set.
# The model's score for a (user, item) pair is the inner product of the
# user's factor vector and the item's factor vector; shown here for
# user 100 and item 200.
print(model.user_factors[100] @ model.item_factors[200])
# YOUR CODE HERE
Trending now
This is a popular solution!
Step by step
Solved in 2 steps