# NOTE(review): removed stray extraction artifacts ("Spaces:" / "Sleeping")
# that were not valid Python and appear to be paste/export residue.
import sys | |
from pathlib import Path | |
import string | |
import random | |
import torch | |
import numpy as np | |
import pickle | |
import pandas as pd | |
import os | |
import json | |
import re | |
# Data access layer: loads the pickled matrices and metadata, keeps cheap
# sub-sampled working copies, and supports growing them incrementally.
class Data:
    """Container for the recommendation matrices and their metadata.

    Holds three matrices loaded from pickle files:
      * ``U`` -- user/job utility matrix,
      * ``P`` -- recommendation matrix (same shape as ``U``),
      * ``S`` -- score matrix (aliases ``U`` when no ``Sfile`` is given),
    plus forward/reverse job-name lookups and an optional user-metadata
    DataFrame.  Working copies (``U_sub``/``P_sub``/``S_sub``) are
    sub-sampled so interactive updates stay cheap; :meth:`update` grows
    them again, restoring real rows/columns from the full matrices while
    available and falling back to synthetic random data beyond that.
    """

    def __init__(self, Ufile, Pfile, Sfile, job_meta_file, user_meta_file,
                 user_groups=None, sub_sample_size=1000):
        (self.U, self.P, self.S,
         self.job_metadata, self.job_metadata_reverse,
         self.user_metadata) = self.load_data(Pfile, Sfile, Ufile,
                                              job_meta_file, user_meta_file)
        # Work on sub-sampled copies; the full matrices are kept so "new"
        # rows/columns can be restored from real data later by update().
        self.U_sub = self.sub_sample(self.U, sub_sample_size)
        self.P_sub = self.sub_sample(self.P, sub_sample_size)
        self.S_sub = self.sub_sample(self.S, sub_sample_size)
        self.lookup_dict = {}      # per-session cache, reset by update()
        self.user_temp_data = {}   # per-session cache, reset by update()
        self.user_groups = user_groups

    def load_data(self, Pfile, Sfile, Ufile, job_meta_file, user_meta_file):
        """Load all pickled inputs.

        Returns ``(U, P, S, job_metadata, job_metadata_reverse,
        user_metadata)``.  When ``Sfile`` is falsy, ``S`` aliases ``U``
        (same tensor object, matching the original behavior); when
        ``job_meta_file`` is falsy, job names default to ``"Job i"``;
        when ``user_meta_file`` is None, ``user_metadata`` is None.
        """
        def _unpickle(path):
            # Context manager so the file handle is closed deterministically
            # (the original left handles open until garbage collection).
            with open(path, 'rb') as fh:
                return pickle.load(fh)

        U = torch.from_numpy(_unpickle(Ufile))
        recommendations = torch.from_numpy(_unpickle(Pfile))
        _, n = recommendations.shape
        S = torch.from_numpy(_unpickle(Sfile)) if Sfile else U
        if job_meta_file:
            job_metadata = _unpickle(job_meta_file)
        else:
            job_metadata = {i: 'Job {}'.format(i) for i in range(n)}
        # Reverse lookup is keyed on the capitalized display name.
        job_metadata_reverse = {v.capitalize(): k for k, v in job_metadata.items()}
        user_metadata = _unpickle(user_meta_file) if user_meta_file is not None else None
        return U, recommendations, S, job_metadata, job_metadata_reverse, user_metadata

    def sub_sample(self, M, sample_size=500):
        """Return the top-left ``sample_size`` x ``sample_size`` corner of
        ``M`` as an independent copy.

        If ``M`` is already within bounds in either dimension, ``M`` itself
        is returned (NOT a copy -- callers that mutate the result mutate
        the original in that case, matching the original behavior).
        """
        if len(M) > sample_size and len(M[0]) > sample_size:
            return M[:sample_size, :sample_size].clone()
        return M

    def update(self, new_user_num, new_job_num):
        """Grow the working matrices by ``new_user_num`` rows and
        ``new_job_num`` columns, refresh metadata, and reset the caches."""
        if new_job_num > 0 or new_user_num > 0:
            self.U_sub = self.add_jobs_users(self.U_sub, self.U, new_job_num, new_user_num)
            self.P_sub = self.add_jobs_users(self.P_sub, self.P, new_job_num, new_user_num)
            self.S_sub = self.add_jobs_users(self.S_sub, self.S, new_job_num, new_user_num)
            print('U_sub shape: ', self.U_sub.shape)
            print('P_sub shape: ', self.P_sub.shape)
            print('S_sub shape: ', self.S_sub.shape)
            self.update_job_metadata(new_job_num)
            self.update_user_metadata(new_user_num)
        self.lookup_dict = {}
        self.user_temp_data = {}

    def add_jobs(self, M_sub, M, new_job_num):
        """Return ``M_sub`` widened by ``new_job_num`` columns.

        Real columns from the full matrix ``M`` are used while available;
        otherwise random columns in [0, 1) are appended.  Always returns a
        torch tensor (the original mixed torch slices with ``np.concatenate``,
        so the return type depended on the branch taken).
        """
        if new_job_num == 0:
            return M_sub
        rows, cols = len(M_sub), len(M_sub[0])
        # '>=' (not '>'): when M has exactly cols + new_job_num columns the
        # slice is still valid -- the original '>' wrongly fell through to
        # the random branch in that case.  The extra len(M) >= rows guard
        # protects against a sub-matrix that already outgrew M's rows.
        if len(M[0]) >= cols + new_job_num and len(M) >= rows:
            return M[:rows, :cols + new_job_num].clone()
        # Row count must match M_sub: the original used M.shape[0], which
        # crashes whenever the sub-sample has fewer rows than M.
        new_cols = torch.rand(rows, new_job_num, dtype=M_sub.dtype)
        return torch.cat((M_sub, new_cols), dim=1)

    def add_users(self, M_sub, M, new_user_num):
        """Return ``M_sub`` lengthened by ``new_user_num`` rows.

        Mirror of :meth:`add_jobs`: real rows from ``M`` while available,
        otherwise random rows in [0, 1).  Always returns a torch tensor.
        """
        if new_user_num == 0:
            return M_sub
        rows, cols = len(M_sub), len(M_sub[0])
        if len(M) >= rows + new_user_num and len(M[0]) >= cols:
            return M[:rows + new_user_num, :cols].clone()
        # Column count must match M_sub (the original used M.shape[1]).
        new_rows = torch.rand(new_user_num, cols, dtype=M_sub.dtype)
        return torch.cat((M_sub, new_rows), dim=0)

    def add_jobs_users(self, M_sub, M, new_job_num, new_user_num):
        """Widen then lengthen ``M_sub`` via add_jobs / add_users."""
        M_updated = self.add_jobs(M_sub, M, new_job_num)
        M_updated = self.add_users(M_updated, M, new_user_num)
        print('M_updated shape: ', M_updated.shape)
        return M_updated

    def tweak_P(self, this_user):
        """With probability 0.8 mark the newest job (last column) as
        recommended for ``this_user``; additionally mark it for one
        uniformly random user."""
        if random.random() > 0.2:
            self.P_sub[int(this_user), -1] = 1.
        user_indices = np.random.randint(0, self.P_sub.shape[0], 1)
        self.P_sub[user_indices, -1] = 1.

    def update_job_metadata(self, new_job_num):
        """Create display names for synthetic jobs -- columns beyond the
        width of the full matrix ``P`` (real jobs already have metadata)."""
        if len(self.P_sub[0]) > len(self.P[0]):
            for _ in range(new_job_num):
                new_id = len(self.job_metadata)
                name = 'Job {}'.format(new_id)
                self.job_metadata[new_id] = name
                # Key by the same capitalized form used in load_data so the
                # two mappings stay exact inverses (the original keyed the
                # reverse map off its own length, which could drift).
                self.job_metadata_reverse[name.capitalize()] = new_id

    def update_user_metadata(self, new_user_num):
        """Generate fake metadata rows for synthetic users -- rows beyond
        the height of the full matrix ``P``."""
        if new_user_num > 0 and len(self.P_sub) > len(self.P):
            # NOTE(review): assumes user_metadata is a DataFrame -- this path
            # fails if the object was built without a user_meta_file; confirm
            # callers never combine user growth with user_metadata=None.
            start = len(self.user_metadata)
            new_user_metadata = pd.DataFrame({
                'Id': [str(i) for i in range(start, start + new_user_num)],
                'Sex': np.random.choice([0, 1], size=new_user_num, p=[.4, .6]),
                'Edu': np.random.choice([0, 1, 2], size=new_user_num, p=[.2, .6, 0.2]),
            })
            new_user_metadata['Sex'] = new_user_metadata['Sex'].map({0: 'F', 1: 'M'})
            new_user_metadata['Edu'] = new_user_metadata['Edu'].map(
                {0: 'High school', 1: 'College', 2: 'Graduate+'})
            self.user_metadata = pd.concat([self.user_metadata, new_user_metadata],
                                           ignore_index=True)