# import os
# from dataclasses import dataclass, field
# from tqdm import tqdm

# import pandas as pd
# import torch
# from torch.utils.data import DataLoader
# import datasets
# from datasets import load_dataset, Dataset
# from accelerate import Accelerator
# from transformers import HfArgumentParser
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# from typing import Optional
# from utils import PiiNERPipeline
# import time

# @dataclass
# class PipelineArgs:
#     model_name: Optional[str] = field(default="./", metadata={"help": "the model name"})
#     process_batch_size: int = field(default=10_000,  metadata={"help": "files per worker"})
#     batch_size: Optional[int] = field(default=1024, metadata={"help": "batch size"})
#     dataset: Optional[str] = field(default="./", metadata={"help": "dataset"})
#     subset: Optional[str] = field(default="data/python/", metadata={"help": "dataset subdirectory"})
#     out_path: Optional[str] = field(default="./results/", metadata={"help": "path for output"})

# def main():
#     """launch code
#     >>>> accelerate config
#     >>>> accelerate launch ner_inference.py --process_batch_size=8 --out_path=processed_dataset
#     """
#     parser = HfArgumentParser(PipelineArgs)
#     args = parser.parse_args()

#     accelerator = Accelerator()
    
#     # name the output directory after the last component of the subset, e.g. "python"
#     out_dir = f"{args.out_path}{args.subset.strip('/').split('/')[-1]}"
#     if accelerator.is_main_process:
#         os.makedirs(out_dir, exist_ok=True)
                                  
#     dataset = load_dataset(args.dataset, data_dir=args.subset, use_auth_token=True, split="train", num_proc=12)
#     dataset = dataset.map(
#         lambda example, idx: {
#             "id": f"{idx}",
#             "max_stars_count": example["max_stars_count"] if example["max_stars_count"] is not None else 0
#             }, 
#             with_indices=True, num_proc=12)
    
#     # pick the per-worker dataloader batch size from the approximate shard size
#     # per process (8 processes assumed here)
#     shard_size = len(dataset) // 8
#     if shard_size > 1_000_000:
#         process_batch_size = 200_000
#     elif shard_size > 100_000:
#         process_batch_size = 100_000
#     else:
#         process_batch_size = 10_000

#     model = AutoModelForTokenClassification.from_pretrained(args.model_name, use_auth_token=True)
#     id_to_label = model.config.id2label
#     tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)
    
#     columns = dataset.column_names
#     dataset = dataset.remove_columns([col for col in  columns if col not in ["content", "id", "max_stars_repo_name", "max_stars_repo_path", "max_stars_count"]])

#     dataloader = DataLoader(dataset, batch_size=process_batch_size, shuffle=False, num_workers=4)
    
#     model, dataloader = accelerator.prepare(model, dataloader)
    
#     pipeline = PiiNERPipeline(
#         model,
#         tokenizer=tokenizer,
#         batch_size=args.batch_size,
#         window_size=512,
#         device=accelerator.local_process_index,
#         num_workers=1,
#         use_auth_token=True,
#         id_to_label=id_to_label,
#         window_overlap=False,
#         bf16=True
#     )
#     num_samples = 0
#     for i, batch in enumerate(tqdm(dataloader)):
#         # the prepared dataloader pads the last batch (wrapping back to the start)
#         # so every process gets equal batches - drop those padded samples here
#         if i==len(dataloader)-1 and int(batch["id"][0])>int(batch["id"][-1]):
#             for j in range(len(batch["id"])-1):
#                 if int(batch["id"][j])>int(batch["id"][j+1]):
#                     stop_index = j+1
#             for key in batch:
#                 batch[key] = batch[key][:stop_index]
#         result = list(pipeline(datasets.Dataset.from_dict(batch)))
        
#         # add original data
#         for k, element in enumerate(result):
#             for key in batch:
#                 element[key] = batch[key][k]
        
#         processed_dataset = Dataset.from_pandas(pd.DataFrame(result))
#         processed_dataset.to_parquet(f"{out_dir}/job_{accelerator.process_index}_{i}.parquet")

# if __name__ == "__main__":
#     main()
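
# Example (illustrative sketch, not part of the original script): the loop above writes one
# parquet shard per process and per outer batch, named job_<process>_<batch>.parquet, into
# out_dir. Assuming that layout, the shards can be recombined into a single dataset later;
# the glob path below is a placeholder for wherever out_dir pointed on your run.
#
# from glob import glob
# from datasets import load_dataset
#
# shards = sorted(glob("./results/python/job_*_*.parquet"))  # placeholder path
# merged = load_dataset("parquet", data_files=shards, split="train")
# print(merged)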

# import torch
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# from privacy.util.code_detect.ner.ner_inference import PiiNERPipeline
# from datasets import Dataset
# from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch
# def main():
#     # Specify the path to your local model and input code file
#     model_path = "pii_inference/nermodel"
#     code_file_path = "input_code.java"

#     # Load the model and tokenizer
#     model = AutoModelForTokenClassification.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(model_path)

#     # Create the NER pipeline
#     pipeline = PiiNERPipeline(
#         model,
#         tokenizer=tokenizer,
#         batch_size=1024,
#         window_size=512,
#         device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#         num_workers=1,
#         id_to_label=model.config.id2label,
#         window_overlap=False,
#         bf16=True
#     )

#     # Read the input code file
#     with open(code_file_path, "r") as file:
#         code = file.read()

#     # Split the code into rough "sentences" on ". " (a simple chunking heuristic)
#     sentences = code.split(". ")
#     print(sentences, "SENTENCES")
#     # Create an id list
#     ids = list(range(len(sentences)))
#     # Create a Dataset object from the sentences
#     dataset = Dataset.from_dict({"content": sentences, "id": ids})

#     # Process the sentences with the NER pipeline
#     result = pipeline(dataset)
#     replacements = get_replacements()
#     # Convert the generator to a list and print the results
#     results = list(result)
#     print(results, "RESULT")
#     # Redact the PII from the results
#     redacted_results = redact_pii_batch(results, replacements)
#     print(redacted_results, "redacted_results")

# if __name__ == "__main__":
#     main()
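
# Example (illustrative sketch, not part of the original script): rather than splitting one
# file on ". ", the same pipeline can be fed one row per source file and left to do its own
# windowing (window_size above), which is presumably how it handles long inputs. `pipeline`
# is the PiiNERPipeline built in main(); the file paths below are placeholders.
#
# from pathlib import Path
# from datasets import Dataset
#
# files = ["input_code.java", "AnotherFile.java"]  # placeholder paths
# dataset = Dataset.from_dict({
#     "content": [Path(f).read_text() for f in files],
#     "id": list(range(len(files))),
# })
# results = list(pipeline(dataset))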