import json
import os
import pickle
import random
import re
import string
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch


class Data:
    """Container for the pickled U / P / S matrices and their metadata.

    The full matrices are kept in ``U``, ``P`` and ``S``; the working copies
    ``U_sub``, ``P_sub`` and ``S_sub`` start as (at most) the top-left
    ``sub_sample_size`` x ``sub_sample_size`` corner and can later be grown
    with :meth:`update`.  ``P`` is the matrix that ``load_data`` calls
    "recommendations"; rows are indexed by user and columns by job.
    """

    def __init__(self, Ufile, Pfile, Sfile, job_meta_file, user_meta_file,
                 user_groups=None, sub_sample_size=1000):
        """Load all inputs from disk and build the sub-sampled working copies.

        Args:
            Ufile: Path to the pickled NumPy array for ``U``.
            Pfile: Path to the pickled NumPy array for ``P``.
            Sfile: Path to the pickled NumPy array for ``S``; when falsy,
                ``S`` is the same tensor object as ``U``.
            job_meta_file: Path to a pickled ``{job index: name}`` dict; when
                falsy, names ``'Job <i>'`` are generated.
            user_meta_file: Path to the pickled user metadata, or ``None``.
            user_groups: Stored unchanged on the instance.
            sub_sample_size: Maximum side length of the working copies.
        """
        (self.U, self.P, self.S,
         self.job_metadata, self.job_metadata_reverse,
         self.user_metadata) = self.load_data(
            Pfile, Sfile, Ufile, job_meta_file, user_meta_file)

        # Work on sub-samples; assign self.U / self.P / self.S directly
        # instead if the full matrices should be used.
        self.U_sub = self.sub_sample(self.U, sub_sample_size)
        self.P_sub = self.sub_sample(self.P, sub_sample_size)
        self.S_sub = self.sub_sample(self.S, sub_sample_size)

        self.lookup_dict = {}
        self.user_temp_data = {}
        self.user_groups = user_groups

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at *path*, closing the file."""
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this at files from a trusted source.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def load_data(self, Pfile, Sfile, Ufile, job_meta_file, user_meta_file):
        """Read the matrices and metadata from their pickle files.

        Returns:
            A tuple ``(U, recommendations, S, job_metadata,
            job_metadata_reverse, user_metadata)``.  The three matrices are
            torch tensors created with ``torch.from_numpy``;
            ``job_metadata_reverse`` maps each capitalized job name back to
            its index; ``user_metadata`` is ``None`` when no file is given.
        """
        U = torch.from_numpy(self._load_pickle(Ufile))
        recommendations = torch.from_numpy(self._load_pickle(Pfile))
        _, n = recommendations.shape

        # Without a separate S file, S aliases U (same tensor object).
        S = torch.from_numpy(self._load_pickle(Sfile)) if Sfile else U

        if job_meta_file:
            job_metadata = self._load_pickle(job_meta_file)
        else:
            job_metadata = {i: 'Job {}'.format(i) for i in range(n)}
        job_metadata_reverse = {
            name.capitalize(): idx for idx, name in job_metadata.items()
        }

        if user_meta_file is not None:
            user_metadata = self._load_pickle(user_meta_file)
        else:
            user_metadata = None

        return (U, recommendations, S, job_metadata, job_metadata_reverse,
                user_metadata)

    def sub_sample(self, M, sample_size=500):
        """Return the top-left ``sample_size`` square of *M* as a copy.

        The cut is only made when *both* dimensions exceed ``sample_size``;
        otherwise *M* itself (not a copy) is returned.
        """
        if len(M) > sample_size and len(M[0]) > sample_size:
            # clone() so later in-place edits never touch the full matrix
            M = M[:sample_size, :sample_size].clone()
        return M

    def update(self, new_user_num, new_job_num):
        """Grow the working matrices and metadata by new users and/or jobs.

        Nothing happens when both counts are zero or negative.  Otherwise
        the three ``*_sub`` matrices are extended, the job and user metadata
        are extended to match, and the ``lookup_dict`` / ``user_temp_data``
        caches are cleared.
        """
        # NOTE(review): the original formatting of this file was lost; the
        # metadata updates and cache resets are assumed to belong inside
        # this condition -- confirm against the intended behaviour.
        if new_job_num > 0 or new_user_num > 0:
            self.U_sub = self.add_jobs_users(self.U_sub, self.U, new_job_num, new_user_num)
            self.P_sub = self.add_jobs_users(self.P_sub, self.P, new_job_num, new_user_num)
            self.S_sub = self.add_jobs_users(self.S_sub, self.S, new_job_num, new_user_num)
            print('U_sub shape: ', self.U_sub.shape)
            print('P_sub shape: ', self.P_sub.shape)
            print('S_sub shape: ', self.S_sub.shape)
            self.update_job_metadata(new_job_num)
            self.update_user_metadata(new_user_num)
            # cached results refer to the old shapes
            self.lookup_dict = {}
            self.user_temp_data = {}

    # def shuffle_rec(P):
    #     rand_rec = P.copy()
    #     rand_rec = rand_rec[:,np.random.permutation(rand_rec.shape[1])]
    #     return rand_rec

    def add_jobs(self, M_sub, M, new_job_num):
        """Return *M_sub* widened by ``new_job_num`` columns.

        When the full matrix *M* still has spare columns (and covers all rows
        of *M_sub*), the wider window is re-cut from *M* and returned as a
        cloned tensor.  Otherwise uniform random values in ``[0, 1)`` are
        appended as new columns and the result is a NumPy array.
        """
        if new_job_num == 0:
            return M_sub

        rows, cols = M_sub.shape[0], M_sub.shape[1]
        # Only re-cut from M when it fully covers the requested window;
        # otherwise the slice would silently come back too small.
        if M.shape[1] > cols + new_job_num and M.shape[0] >= rows:
            return M[:rows, :cols + new_job_num].clone()

        # Size the padding from M_sub (not M) so the concatenate always lines up.
        new_jobM = np.random.rand(rows, new_job_num)
        return np.concatenate((M_sub, new_jobM), axis=1)

    def add_users(self, M_sub, M, new_user_num):
        """Return *M_sub* extended by ``new_user_num`` rows.

        When the full matrix *M* still has spare rows (and covers all columns
        of *M_sub*), the taller window is re-cut from *M* and returned as a
        cloned tensor.  Otherwise uniform random values in ``[0, 1)`` are
        appended as new rows and the result is a NumPy array.
        """
        if new_user_num == 0:
            return M_sub

        rows, cols = M_sub.shape[0], M_sub.shape[1]
        # Only re-cut from M when it fully covers the requested window.
        if M.shape[0] > rows + new_user_num and M.shape[1] >= cols:
            return M[:rows + new_user_num, :cols].clone()

        # Width comes from M_sub, which add_jobs may already have widened
        # beyond the width of M.
        new_userM = np.random.rand(new_user_num, cols)
        return np.concatenate((M_sub, new_userM), axis=0)

    def add_jobs_users(self, M_sub, M, new_job_num, new_user_num):
        """Apply :meth:`add_jobs` and then :meth:`add_users` to *M_sub*."""
        M_updated = self.add_jobs(M_sub, M, new_job_num)
        M_updated = self.add_users(M_updated, M, new_user_num)
        print('M_updated shape: ', M_updated.shape)
        return M_updated

    def tweak_P(self, this_user):
        """Randomly set entries in the last column of ``P_sub`` to 1.

        With probability 0.8 the entry for ``this_user`` is set; in addition
        the entry of one uniformly chosen user is set.
        """
        prob = random.random()
        if prob > 0.2:
            self.P_sub[int(this_user), -1] = 1.
        # NOTE(review): the original formatting of this file was lost; the
        # random-user step is assumed to run unconditionally -- confirm.
        user_indices = np.random.randint(0, self.P_sub.shape[0], 1)
        self.P_sub[user_indices, -1] = 1.

    def update_job_metadata(self, new_job_num):
        """Register ``new_job_num`` generated job names in both lookup maps.

        Names are only added once ``P_sub`` has more columns than ``P``,
        i.e. when it contains jobs that the full matrix does not.
        """
        if len(self.P_sub[0]) > len(self.P[0]):
            for _ in range(new_job_num):
                # Derive the id once so the forward and reverse maps can
                # never disagree about it.
                job_id = len(self.job_metadata)
                job_name = 'Job {}'.format(job_id)
                self.job_metadata[job_id] = job_name
                self.job_metadata_reverse[job_name] = job_id

    def update_user_metadata(self, new_user_num):
        """Append randomly generated metadata rows for ``new_user_num`` users.

        Rows are only added once ``P_sub`` has more rows than ``P``.  Each
        new row gets a string ``Id`` continuing the current row count, a
        ``Sex`` ('F' with probability 0.4, 'M' with 0.6) and an ``Edu`` level
        ('High school' 0.2, 'College' 0.6, 'Graduate+' 0.2).  When no user
        metadata was loaded, the generated rows become the metadata.
        """
        # TODO: generate fake user metadata for CB
        if new_user_num <= 0 or len(self.P_sub) <= len(self.P):
            return

        # user_metadata is None when no metadata file was supplied
        first_id = 0 if self.user_metadata is None else len(self.user_metadata)

        new_user_metadata = {}
        new_user_metadata['Id'] = [str(i) for i in range(first_id, first_id + new_user_num)]
        new_user_metadata['Sex'] = np.random.choice([0, 1], size=new_user_num, p=[.4, .6])
        new_user_metadata['Edu'] = np.random.choice([0, 1, 2], size=new_user_num, p=[.2, .6, 0.2])
        new_user_metadata = pd.DataFrame(new_user_metadata)
        new_user_metadata['Sex'] = new_user_metadata['Sex'].map({0: 'F', 1: 'M'})
        new_user_metadata['Edu'] = new_user_metadata['Edu'].map(
            {0: 'High school', 1: 'College', 2: 'Graduate+'})

        if self.user_metadata is None:
            self.user_metadata = new_user_metadata
        else:
            self.user_metadata = pd.concat(
                [self.user_metadata, new_user_metadata], ignore_index=True)