import sys
from pathlib import Path
import string
import random
import torch
import numpy as np
import pickle
import pandas as pd
import os
import json
import re
# refactor as a class with the following methods
class Data:
    def __init__(self, Ufile, Pfile, Sfile, job_meta_file, user_meta_file, user_groups=None, sub_sample_size=1000):
        self.U, self.P, self.S, self.job_metadata, self.job_metadata_reverse, self.user_metadata = self.load_data(
            Pfile, Sfile, Ufile, job_meta_file, user_meta_file)
        # subsample the data
        self.U_sub = self.sub_sample(self.U, sub_sample_size)
        self.P_sub = self.sub_sample(self.P, sub_sample_size)
        self.S_sub = self.sub_sample(self.S, sub_sample_size)
        # self.U_sub = self.U
        # self.P_sub = self.P
        # self.S_sub = self.S
        self.lookup_dict = {}
        self.user_temp_data = {}
        self.user_groups = user_groups
    def load_data(self, Pfile, Sfile, Ufile, job_meta_file, user_meta_file):
        U = torch.from_numpy(pickle.load(open(Ufile, 'rb')))
        recommendations = torch.from_numpy(pickle.load(open(Pfile, 'rb')))
        m, n = recommendations.shape
        if Sfile:
            S = torch.from_numpy(pickle.load(open(Sfile, 'rb')))
        else:
            S = U
        if job_meta_file:
            job_metadata = pickle.load(open(job_meta_file, 'rb'))
        else:
            job_metadata = {}
            for i in range(n):
                job_metadata[i] = 'Job {}'.format(i)
        job_metadata_reverse = {v.capitalize(): k for k, v in job_metadata.items()}
        if user_meta_file is not None:
            user_metadata = pickle.load(open(user_meta_file, 'rb'))
        else:
            user_metadata = None
        return U, recommendations, S, job_metadata, job_metadata_reverse, user_metadata
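    # Returned objects (illustrative note, inferred from how they are used below):
    # U, P (recommendations) and S are torch tensors of shape (num_users, num_jobs);
    # job_metadata maps column index -> job name, job_metadata_reverse maps the
    # capitalized name back to the index, and user_metadata is presumably a
    # pandas DataFrame (or None when no user_meta_file is given).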
    def sub_sample(self, M, sample_size=500):
        if len(M) > sample_size and len(M[0]) > sample_size:
            # take the first sample_size rows and columns of M, copied so the original is untouched
            M = M[:sample_size, :sample_size].clone()
        return M
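    # Example (illustrative): with a 5000 x 8000 matrix and sample_size=1000,
    # sub_sample returns a cloned 1000 x 1000 top-left block; a 300 x 400 matrix
    # is returned unchanged because it is already below the threshold.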
    def update(self, new_user_num, new_job_num):
        # refactor this function
        # recdata.lookup_dict = {}
        # user_temp_data = {}
        # U = add_jobs(U, new_job_num)
        # recommendations = update_P(recommendations, new_job_num, 0)
        # generate a random float between 0 and 1
        # prob = random.random()
        # if prob > 0.2:
        #     recommendations[int(user), -1] = 1.
        # S = add_jobs(S, new_job_num)
        # U, recommendations, S = add_jobs(U, new_job_num), add_jobs(recommendations, new_job_num), add_jobs(S, new_job_num)
        # job_metadata = update_job_metadata(job_metadata, new_job_num)
        # job_metadata_reverse = {v: k for k, v in job_metadata.items()}
        if new_job_num > 0 or new_user_num > 0:
            self.U_sub = self.add_jobs_users(self.U_sub, self.U, new_job_num, new_user_num)
            self.P_sub = self.add_jobs_users(self.P_sub, self.P, new_job_num, new_user_num)
            self.S_sub = self.add_jobs_users(self.S_sub, self.S, new_job_num, new_user_num)
            print('U_sub shape: ', self.U_sub.shape)
            print('P_sub shape: ', self.P_sub.shape)
            print('S_sub shape: ', self.S_sub.shape)
            self.update_job_metadata(new_job_num)
            self.update_user_metadata(new_user_num)
        # invalidate cached per-user data after every update
        self.lookup_dict = {}
        self.user_temp_data = {}
    # def shuffle_rec(P):
    #     rand_rec = P.copy()
    #     rand_rec = rand_rec[:, np.random.permutation(rand_rec.shape[1])]
    #     return rand_rec
    def add_jobs(self, M_sub, M, new_job_num):
        # append new_job_num job columns to the subsample M_sub, reusing columns
        # from the full matrix M while it still has unused ones
        if new_job_num == 0:
            return M_sub
        if len(M[0]) > len(M_sub[0]) + new_job_num:
            M_updated = M[:len(M_sub), :len(M_sub[0]) + new_job_num].clone()
        else:
            # otherwise draw the new job columns uniformly at random in [0, 1);
            # the new block must have as many rows as M_sub for the concatenation
            new_jobM = np.random.rand(len(M_sub), new_job_num)
            # concatenate the new job columns onto M_sub
            M_updated = np.concatenate((M_sub, new_jobM), axis=1)
        return M_updated
    def add_users(self, M_sub, M, new_user_num):
        # append new_user_num user rows to the subsample M_sub, reusing rows
        # from the full matrix M while it still has unused ones
        if new_user_num == 0:
            return M_sub
        if len(M) > len(M_sub) + new_user_num:
            M_updated = M[:len(M_sub) + new_user_num, :len(M_sub[0])].clone()
        else:
            # otherwise draw the new user rows uniformly at random in [0, 1);
            # the new block must have as many columns as M_sub for the concatenation
            new_userM = np.random.rand(new_user_num, len(M_sub[0]))
            # concatenate the new user rows onto M_sub
            M_updated = np.concatenate((M_sub, new_userM), axis=0)
        return M_updated
    def add_jobs_users(self, M_sub, M, new_job_num, new_user_num):
        # grow the subsample by first adding job columns, then user rows
        M_updated = self.add_jobs(M_sub, M, new_job_num)
        M_updated = self.add_users(M_updated, M, new_user_num)
        print('M_updated shape: ', M_updated.shape)
        return M_updated
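    # Example (illustrative): if P is 5000 x 8000 and P_sub is 1000 x 1000,
    # add_jobs_users(P_sub, P, 2, 1) first slices two extra job columns out of
    # P (giving 1000 x 1002) and then one extra user row (giving 1001 x 1002);
    # only once the full matrix is exhausted are random rows/columns appended.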
    def tweak_P(self, this_user):
        # with 80% probability, set the last (newest) job column to 1 for this_user
        prob = random.random()
        if prob > 0.2:
            self.P_sub[int(this_user), -1] = 1.
        # and always set it to 1 for one randomly chosen user
        user_indices = np.random.randint(0, self.P_sub.shape[0], 1)
        self.P_sub[user_indices, -1] = 1.
    def update_job_metadata(self, new_job_num):
        # only invent names for jobs added beyond the full matrix; columns
        # sliced from self.P already have metadata from load_data
        if len(self.P_sub[0]) > len(self.P[0]):
            for _ in range(new_job_num):
                new_id = len(self.job_metadata)
                new_name = 'Job {}'.format(new_id)
                self.job_metadata[new_id] = new_name
                self.job_metadata_reverse[new_name] = new_id
    def update_user_metadata(self, new_user_num):  # TODO: generate fake user metadata for CB
        if new_user_num > 0:
            if len(self.P_sub) > len(self.P):
                # build a dataframe of synthetic metadata for the newly invented users
                start = 0 if self.user_metadata is None else len(self.user_metadata)
                new_user_metadata = {}
                new_user_metadata['Id'] = [str(i) for i in range(start, start + new_user_num)]
                new_user_metadata['Sex'] = np.random.choice([0, 1], size=new_user_num, p=[.4, .6])
                new_user_metadata['Edu'] = np.random.choice([0, 1, 2], size=new_user_num, p=[.2, .6, .2])
                new_user_metadata = pd.DataFrame(new_user_metadata)
                new_user_metadata['Sex'] = new_user_metadata['Sex'].map({0: 'F', 1: 'M'})
                new_user_metadata['Edu'] = new_user_metadata['Edu'].map({0: 'High school', 1: 'College', 2: 'Graduate+'})
                # append the new rows to the existing user metadata
                if self.user_metadata is None:
                    self.user_metadata = new_user_metadata
                else:
                    self.user_metadata = pd.concat([self.user_metadata, new_user_metadata], ignore_index=True)
                # print(self.user_metadata)
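

# A minimal usage sketch (assumed example, not part of the original module);
# the pickle file names below are hypothetical placeholders for the matrices
# and metadata this class expects.
if __name__ == '__main__':
    data = Data(
        Ufile='U.pkl',           # hypothetical pickled user-job matrix (numpy array)
        Pfile='P.pkl',           # hypothetical pickled recommendation matrix (numpy array)
        Sfile=None,              # no score matrix: load_data falls back to U
        job_meta_file=None,      # no job metadata: generic 'Job i' names are generated
        user_meta_file=None,
        sub_sample_size=100,
    )
    print('P_sub shape:', data.P_sub.shape)
    # simulate one new job and one new user arriving, then refresh the subsamples
    data.update(new_user_num=1, new_job_num=1)
    data.tweak_P(this_user=0)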