# NOTE: removed web-scrape residue (page header / line-number gutter) that was
# accidentally captured with this file and is not valid Python.
import sys
from pathlib import Path
import string
import random
import torch
import numpy as np
import pickle
import pandas as pd
import os
import json
import re
class Data:
    """Container for recommender matrices and their metadata.

    Loads three pickled matrices — U (from ``Ufile``), P / "recommendations"
    (from ``Pfile``) and S (from ``Sfile``, falling back to U when ``Sfile``
    is falsy) — plus job and user metadata.  A square sub-sample of each
    matrix is kept in ``U_sub`` / ``P_sub`` / ``S_sub``; :meth:`update` grows
    the sub-samples by revealing rows (users) and columns (jobs) from the
    full matrices while real data remains, and by generating random values
    in [0, 1) once it is exhausted.

    NOTE(review): the exact semantics of U vs. S (features? scores?) are not
    visible from this file — confirm against the producer of the pickles.
    """

    def __init__(self, Ufile, Pfile, Sfile, job_meta_file, user_meta_file,
                 user_groups=None, sub_sample_size=1000):
        (self.U, self.P, self.S, self.job_metadata,
         self.job_metadata_reverse, self.user_metadata) = self.load_data(
            Pfile, Sfile, Ufile, job_meta_file, user_meta_file)
        # Work on a square sub-sample so the interactive demo stays small.
        self.U_sub = self.sub_sample(self.U, sub_sample_size)
        self.P_sub = self.sub_sample(self.P, sub_sample_size)
        self.S_sub = self.sub_sample(self.S, sub_sample_size)
        self.lookup_dict = {}      # per-session cache; cleared by update()
        self.user_temp_data = {}   # per-session cache; cleared by update()
        self.user_groups = user_groups

    @staticmethod
    def _load_pickle(path):
        """Load one pickled object, closing the file handle.

        Fix: the original used ``pickle.load(open(path, 'rb'))``, which
        leaks the file descriptor.
        """
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    def load_data(self, Pfile, Sfile, Ufile, job_meta_file, user_meta_file):
        """Read all matrices and metadata from disk.

        Returns ``(U, recommendations, S, job_metadata,
        job_metadata_reverse, user_metadata)``.  Matrices are converted to
        torch tensors; ``S`` falls back to ``U`` and ``user_metadata`` to
        ``None`` when the corresponding file is not given.
        """
        U = torch.from_numpy(self._load_pickle(Ufile))
        recommendations = torch.from_numpy(self._load_pickle(Pfile))
        _, n_jobs = recommendations.shape
        S = torch.from_numpy(self._load_pickle(Sfile)) if Sfile else U
        if job_meta_file:
            job_metadata = self._load_pickle(job_meta_file)
        else:
            # Synthesize a placeholder name per recommendation column.
            job_metadata = {i: 'Job {}'.format(i) for i in range(n_jobs)}
        # Reverse lookup keyed on the capitalized display name.
        job_metadata_reverse = {v.capitalize(): k for k, v in job_metadata.items()}
        user_metadata = (self._load_pickle(user_meta_file)
                         if user_meta_file is not None else None)
        return U, recommendations, S, job_metadata, job_metadata_reverse, user_metadata

    def sub_sample(self, M, sample_size=500):
        """Return a detached copy of the top-left ``sample_size`` square of M.

        M is returned unchanged when either dimension is already within
        ``sample_size`` (same behaviour as the original).
        """
        if len(M) > sample_size and len(M[0]) > sample_size:
            M = M[:sample_size, :sample_size].clone()
        return M

    def update(self, new_user_num, new_job_num):
        """Grow the sub-samples by ``new_user_num`` rows and ``new_job_num``
        columns, extend the metadata accordingly, and clear the caches."""
        if new_job_num > 0 or new_user_num > 0:
            self.U_sub = self.add_jobs_users(self.U_sub, self.U, new_job_num, new_user_num)
            self.P_sub = self.add_jobs_users(self.P_sub, self.P, new_job_num, new_user_num)
            self.S_sub = self.add_jobs_users(self.S_sub, self.S, new_job_num, new_user_num)
            print('U_sub shape: ', self.U_sub.shape)
            print('P_sub shape: ', self.P_sub.shape)
            print('S_sub shape: ', self.S_sub.shape)
            self.update_job_metadata(new_job_num)
            self.update_user_metadata(new_user_num)
        self.lookup_dict = {}
        self.user_temp_data = {}

    def add_jobs(self, M_sub, M, new_job_num):
        """Append ``new_job_num`` columns to ``M_sub``.

        Columns are revealed from the full matrix ``M`` while it has enough;
        otherwise random columns in [0, 1) are generated.  Fixes vs. the
        original: the random block is sized from ``M_sub`` (not ``M``), so
        the concatenation is always dimension-consistent, and ``torch.cat``
        keeps the result a torch tensor instead of silently converting to
        numpy; the slice branch also checks M still covers M_sub's rows so
        it cannot silently truncate.
        """
        if new_job_num == 0:
            return M_sub
        rows, cols = len(M_sub), len(M_sub[0])
        if len(M[0]) > cols + new_job_num and len(M) >= rows:
            return M[:rows, :cols + new_job_num].clone()
        new_cols = torch.rand(rows, new_job_num, dtype=M_sub.dtype)
        return torch.cat((M_sub, new_cols), dim=1)

    def add_users(self, M_sub, M, new_user_num):
        """Append ``new_user_num`` rows to ``M_sub``.

        Mirror of :meth:`add_jobs` for rows, with the same fixes (random
        block sized from ``M_sub``; torch.cat; no silent truncation when
        M_sub has more columns than M).
        """
        if new_user_num == 0:
            return M_sub
        rows, cols = len(M_sub), len(M_sub[0])
        if len(M) > rows + new_user_num and len(M[0]) >= cols:
            return M[:rows + new_user_num, :cols].clone()
        new_rows = torch.rand(new_user_num, cols, dtype=M_sub.dtype)
        return torch.cat((M_sub, new_rows), dim=0)

    def add_jobs_users(self, M_sub, M, new_job_num, new_user_num):
        """Append jobs (columns) first, then users (rows), to ``M_sub``."""
        M_updated = self.add_jobs(M_sub, M, new_job_num)
        M_updated = self.add_users(M_updated, M, new_user_num)
        print('M_updated shape: ', M_updated.shape)
        return M_updated

    def tweak_P(self, this_user):
        """Mark the newest job (last column of ``P_sub``) as recommended for
        ``this_user`` with probability 0.8, and unconditionally for one
        uniformly random user."""
        if random.random() > 0.2:
            self.P_sub[int(this_user), -1] = 1.
        user_indices = np.random.randint(0, self.P_sub.shape[0], 1)
        self.P_sub[user_indices, -1] = 1.

    def update_job_metadata(self, new_job_num):
        """Add placeholder names for jobs that exist only in the sub-sample
        (i.e. were randomly generated rather than revealed from ``P``).

        Fix: the original re-read ``len(...)`` between the two dict updates,
        so ``job_metadata`` and ``job_metadata_reverse`` could drift apart;
        both are now updated with the same index/name pair.
        """
        if len(self.P_sub[0]) > len(self.P[0]):
            for _ in range(new_job_num):
                idx = len(self.job_metadata)
                name = 'Job {}'.format(idx)
                self.job_metadata[idx] = name
                self.job_metadata_reverse[name] = idx

    def update_user_metadata(self, new_user_num):
        """Generate fake demographic rows for users that exist only in the
        sub-sample and append them to ``user_metadata``.

        Fix: no-op when no user metadata was loaded — the original raised
        ``TypeError`` on ``len(None)`` / ``pd.concat`` with ``None``.
        """
        if new_user_num <= 0 or self.user_metadata is None:
            return
        if len(self.P_sub) > len(self.P):
            start = len(self.user_metadata)
            new_user_metadata = pd.DataFrame({
                'Id': [str(i) for i in range(start, start + new_user_num)],
                'Sex': np.random.choice([0, 1], size=new_user_num, p=[.4, .6]),
                'Edu': np.random.choice([0, 1, 2], size=new_user_num, p=[.2, .6, 0.2]),
            })
            new_user_metadata['Sex'] = new_user_metadata['Sex'].map({0: 'F', 1: 'M'})
            new_user_metadata['Edu'] = new_user_metadata['Edu'].map(
                {0: 'High school', 1: 'College', 2: 'Graduate+'})
            self.user_metadata = pd.concat(
                [self.user_metadata, new_user_metadata], ignore_index=True)