File size: 5,918 Bytes
54fa0c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# import os
# from dataclasses import dataclass, field
# from tqdm import tqdm
# import pandas as pd
# import torch
# from torch.utils.data import DataLoader
# import datasets
# from datasets import load_dataset, Dataset
# from accelerate import Accelerator
# from transformers import HfArgumentParser
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# from typing import Optional
# from utils import PiiNERPipeline
# import time
# @dataclass
# class PipelineArgs:
# model_name: Optional[str] = field(default="./", metadata={"help": "the model name"})
# process_batch_size: int = field(default=10_000, metadata={"help": "files per worker"})
# batch_size: Optional[int] = field(default=1024, metadata={"help": "batch size"})
# dataset: Optional[str] = field(default="./", metadata={"help": "dataset"})
# subset: Optional[str] = field(default="data/python/", metadata={"help": "dataset subdirectory"})
# out_path: Optional[str] = field(default="./results/", metadata={"help": "path for output"})
# email= "[email protected]"
# def main():
# """Launch commands:
# >>>> accelerate config
# >>>> accelerate launch ner_inference.py --process_batch_size=8 --out_path=processed_dataset
# """
# parser = HfArgumentParser(PipelineArgs)
# args = parser.parse_args()
# accelerator = Accelerator()
# out_dir = f"{args.out_path}{args.subset.strip('/').split('/')[-2]}"
# if accelerator.is_main_process:
# if not os.path.exists(out_dir):
# os.mkdir(out_dir)
# dataset = load_dataset(args.dataset, data_dir=args.subset, use_auth_token=True, split="train", num_proc=12)
# dataset = dataset.map(
# lambda example, idx: {
# "id": f"{idx}",
# "max_stars_count": example["max_stars_count"] if example["max_stars_count"] is not None else 0
# },
# with_indices=True, num_proc=12)
# shard_size = (len(dataset))/8
# if shard_size > 1_000_000:
# process_batch_size = 200_000
# elif shard_size > 100_000:
# process_batch_size = 100_000
# else:
# process_batch_size = 10_000
# model = AutoModelForTokenClassification.from_pretrained(args.model_name, use_auth_token=True)
# id_to_label = model.config.id2label
# tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)
# columns = dataset.column_names
# dataset = dataset.remove_columns([col for col in columns if col not in ["content", "id", "max_stars_repo_name", "max_stars_repo_path", "max_stars_count"]])
# dataloader = DataLoader(dataset, batch_size=process_batch_size, shuffle=False, num_workers=4)
# model, dataloader = accelerator.prepare(model, dataloader)
# pipeline = PiiNERPipeline(
# model,
# tokenizer=tokenizer,
# batch_size=args.batch_size,
# window_size=512,
# device=accelerator.local_process_index,
# num_workers=1,
# use_auth_token=True,
# id_to_label=id_to_label,
# window_overlap=False,
# bf16=True
# )
# num_samples = 0
# for i, batch in enumerate(tqdm(dataloader)):
# # the last batch may be padded with repeated earlier samples (ids drop back to smaller values) - cut the batch at that point
# if i==len(dataloader)-1 and int(batch["id"][0])>int(batch["id"][-1]):
# for j in range(len(batch["id"])-1):
# if int(batch["id"][j])>int(batch["id"][j+1]):
# stop_index = j+1
# for key in batch:
# batch[key] = batch[key][:stop_index]
# result = list(pipeline(datasets.Dataset.from_dict(batch)))
# # add original data
# for k, element in enumerate(result):
# for key in batch:
# element[key] = batch[key][k]
# processed_dataset = Dataset.from_dict(pd.DataFrame(result))
# processed_dataset.to_parquet(f"{out_dir}/job_{accelerator.process_index}_{i}.parquet")
# if __name__ == "__main__":
# main()
# import torch
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# from privacy.util.code_detect.ner.ner_inference import PiiNERPipeline
# from datasets import Dataset
# from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch
# def main():
# # Specify the path to your local model and input code file
# model_path = "pii_inference/nermodel"
# code_file_path = "input_code.java"
# # Load the model and tokenizer
# model = AutoModelForTokenClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# # Create the NER pipeline
# pipeline = PiiNERPipeline(
# model,
# tokenizer=tokenizer,
# batch_size=1024,
# window_size=512,
# device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
# num_workers=1,
# id_to_label=model.config.id2label,
# window_overlap=False,
# bf16=True
# )
# # Read the input code file
# with open(code_file_path, "r") as file:
# code = file.read()
# # Split the code into chunks on ". " (a period followed by a space)
# sentences = code.split(". ")
# print(sentences, "SENTENCES")
# # Create an id list
# ids = list(range(len(sentences)))
# # Create a Dataset object from the sentences
# dataset = Dataset.from_dict({"content": sentences, "id": ids})
# # Process the sentences with the NER pipeline
# result = pipeline(dataset)
# replacements = get_replacements()
# # Convert the generator to a list and print the results
# results = list(result)
# print(results, "RESULT")
# # Redact the PII from the results
# redacted_results = redact_pii_batch(results, replacements)
# print(redacted_results, "redacted_results")
# if __name__ == "__main__":
# main()
|