File size: 6,981 Bytes
c3279e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import sys

from pathlib import Path
import string
import random
import torch
import numpy as np
import pickle
import pandas as pd
import os
import json
import re



# Data: bundles the user (U), recommendation (P) and score (S) matrices,
# their working subsamples, and job/user metadata, with helpers to grow them.
class Data:
    def __init__(self, Ufile, Pfile, Sfile, job_meta_file, user_meta_file, user_groups=None, sub_sample_size=1000):
        self.U, self.P, self.S, self.job_metadata, self.job_metadata_reverse, self.user_metadata = self.load_data(Pfile, Sfile, Ufile, job_meta_file, user_meta_file)
        # subsample the data
        self.U_sub = self.sub_sample(self.U, sub_sample_size)
        self.P_sub = self.sub_sample(self.P, sub_sample_size)
        self.S_sub = self.sub_sample(self.S, sub_sample_size)
        # self.U_sub = self.U
        # self.P_sub = self.P
        # self.S_sub = self.S
        self.lookup_dict = {}
        self.user_temp_data = {}
        self.user_groups = user_groups


    def load_data(self, Pfile, Sfile, Ufile, job_meta_file, user_meta_file):
        U = torch.from_numpy(pickle.load(open(Ufile, 'rb')))
        recommendations = torch.from_numpy(pickle.load(open(Pfile, 'rb')))
        m, n = recommendations.shape
        if Sfile:
            S = torch.from_numpy(pickle.load(open(Sfile, 'rb')))
        else:
            S = U
        if job_meta_file:
            job_metadata = pickle.load(open(job_meta_file, 'rb'))
        else:
            job_metadata = {}
            for i in range(n):
                job_metadata[i] = 'Job {}'.format(i)
        job_metadata_reverse = {v.capitalize(): k for k, v in job_metadata.items()}
        if user_meta_file is not None:
            user_metadata = pickle.load(open(user_meta_file, 'rb'))
        else:
            user_metadata = None

        return U, recommendations, S, job_metadata, job_metadata_reverse, user_metadata
    
    def sub_sample(self, M, sample_size=500):
        if len(M) > sample_size and len(M[0]) > sample_size:
            # take the first sample_size columns and rows of M, copy without touching the original
            M = M[:sample_size, :sample_size].clone()
        return M
    
    def update(self, new_user_num, new_job_num):
        # refactor this function
        # recdata.lookup_dict = {}
        # user_temp_data = {}
        # U = add_jobs(U, new_job_num)
        # recommendations = update_P(recommendations, new_job_num, 0)
        # generate a random float between 0 and 1
        # prob = random.random()
        # if prob > 0.2:
        #     recommendations[int(user),-1] = 1.
        # S = add_jobs(S, new_job_num)
        # U, recommendations, S = add_jobs(U, new_job_num), add_jobs(recommendations, new_job_num), add_jobs(S, new_job_num)
        # job_metadata = update_job_metadata(job_metadata, new_job_num)
        # job_metadata_reverse = {v: k for k, v in job_metadata.items()}
        if new_job_num > 0 or new_user_num > 0:
            self.U_sub = self.add_jobs_users(self.U_sub, self.U, new_job_num, new_user_num)
            self.P_sub = self.add_jobs_users(self.P_sub, self.P, new_job_num, new_user_num)
            self.S_sub = self.add_jobs_users(self.S_sub, self.S, new_job_num, new_user_num)
            print('U_sub shape: ', self.U_sub.shape)
            print('P_sub shape: ', self.P_sub.shape)
            print('S_sub shape: ', self.S_sub.shape)
            self.update_job_metadata(new_job_num)
            self.update_user_metadata(new_user_num)
            self.lookup_dict = {}
            self.user_temp_data = {}



    # def shuffle_rec(P):
    #     rand_rec = P.copy()
    #     rand_rec = rand_rec[:,np.random.permutation(rand_rec.shape[1])]
    #     return rand_rec

    def add_jobs(self, M_sub, M, new_job_num): # refactor this function, accept one matrix as input
        if new_job_num == 0:
            return M_sub
        if len(M[0]) > len(M_sub[0]) + new_job_num:
            M_updated = M[:len(M_sub), :len(M_sub[0]) + new_job_num].clone()
        else:
            # random number between 0 and 1 with size (S.shape[0],new_job_num)
            new_jobM = np.random.rand(M.shape[0], new_job_num)
            # concat new jobM to M as new columns
            M_updated = np.concatenate((M_sub, new_jobM), axis=1)

        return M_updated

    def add_users(self, M_sub, M, new_user_num): # refactor this function, accept one matrix as input
        if new_user_num == 0:
            return M_sub
        if len(M) > len(M_sub) + new_user_num:
            M_updated = M[:len(M_sub) + new_user_num, :len(M_sub[0])].clone()
        else:
            # random number between 0 and 1 with size (new_user_num,S.shape[1])
            new_userM = np.random.rand(new_user_num, M.shape[1])
            # concat new userM to M as new rows
            M_updated = np.concatenate((M_sub, new_userM), axis=0)

        return M_updated


    def add_jobs_users(self, M_sub, M, new_job_num, new_user_num):
        # use add_jobs and add_users to add new jobs and users
        M_updated = self.add_jobs(M_sub, M, new_job_num)
        M_updated = self.add_users(M_updated, M, new_user_num)
        print('M_updated shape: ', M_updated.shape)
        return M_updated

    def tweak_P(self, this_user):
        # Simulate feedback on the newest job (the last column of P_sub):
        # with 80% probability, mark that job as recommended for `this_user`
        # and for one additional uniformly-random user row.
        # NOTE(review): the extra random user may coincide with `this_user`;
        # presumably acceptable since the write is idempotent — confirm.
        # generate a random float between 0 and 1
        prob = random.random()
        if prob > 0.2:
            self.P_sub[int(this_user),-1] = 1.
            # 1 random indices of users within the range of P.shape[0]
            user_indices = np.random.randint(0, self.P_sub.shape[0], 1)
            self.P_sub[user_indices, -1] = 1.

    def update_job_metadata(self, new_job_num):
        # Append synthetic 'Job N' entries for newly added job columns — but
        # only once the subsample has outgrown the full matrix (i.e. add_jobs
        # took its random-data branch); while columns are still sliced from
        # the full matrix P, metadata for them already exists.
        # NOTE(review): keys are derived from the dicts' current sizes, which
        # assumes job_metadata and job_metadata_reverse stay the same length
        # and use contiguous 0..N-1 indices / 'Job N' names — verify against
        # load_data's metadata (file-loaded metadata may not follow this).
        if len(self.P_sub[0]) > len(self.P[0]):
            for i in range(new_job_num):
                self.job_metadata[len(self.job_metadata)] = 'Job {}'.format(len(self.job_metadata))
                self.job_metadata_reverse['Job {}'.format(len(self.job_metadata_reverse))] = len(self.job_metadata_reverse)


    def update_user_metadata(self, new_user_num): # TODO: generate fake user metadata for CB
        # Append randomly generated user rows (Id, Sex, Edu) to user_metadata
        # for newly added users — but only once the subsample has outgrown the
        # full matrix (i.e. add_users took its random-data branch).
        # NOTE(review): assumes self.user_metadata is a pandas DataFrame whose
        # length matches the current user count; load_data can leave it None,
        # in which case len(self.user_metadata) here would raise — confirm
        # callers only use this with user metadata loaded.
        if new_user_num > 0:
            if len(self.P_sub) > len(self.P):
                # make a new dataframe with new user metadata
                new_user_metadata = {}
                # Ids continue the existing numbering as strings.
                new_user_metadata['Id'] = [str(i) for i in range(len(self.user_metadata), len(self.user_metadata) + new_user_num)]
                # Sex drawn 40% F / 60% M; Edu 20/60/20 across three levels.
                new_user_metadata['Sex'] = np.random.choice([0, 1], size=new_user_num, p=[.4, .6])
                new_user_metadata['Edu'] = np.random.choice([0, 1, 2], size=new_user_num, p=[.2, .6, 0.2])
                new_user_metadata = pd.DataFrame(new_user_metadata)
                new_user_metadata['Sex'] = new_user_metadata['Sex'].map({0:'F', 1:'M'})
                new_user_metadata['Edu'] = new_user_metadata['Edu'].map({0:'High school', 1:'College', 2:'Graduate+'})
                # concat new user metadata to old user metadata
                self.user_metadata = pd.concat([self.user_metadata, new_user_metadata], ignore_index=True)
                # print(user_metadata)