|
import torch |
|
from transformers import AutoModelForTokenClassification, AutoTokenizer |
|
|
|
from datasets import Dataset |
|
|
|
from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch |
|
import json |
|
import re |
|
import os |
|
from privacy.util.code_detect.ner.pii_inference.utils.pipeline import PiiNERPipeline |
|
class codeNer:
    """Redact PII (names, emails, keys, ...) from source code or plain text
    using a token-classification NER model.

    Both public entry points are stateless, so they are declared as
    ``@staticmethod`` (previously they were plain functions inside the class
    body, which would mis-bind ``code=self`` when called on an instance).
    """

    @staticmethod
    def _redact_parts(code, model, tokenizer):
        """Run the PII NER pipeline over *code* and return redacted parts.

        Shared implementation for :meth:`codeFile` and :meth:`codeText`:
        decodes bytes, splits the input into sentence-like chunks, runs the
        NER pipeline, and applies the configured replacements.

        Args:
            code: ``str`` or UTF-8 ``bytes`` to redact.
            model: HF token-classification model (provides ``config.id2label``).
            tokenizer: matching HF tokenizer.

        Returns:
            The ``new_content`` list produced by ``redact_pii_batch`` —
            redacted text parts in input order.
        """
        if isinstance(code, bytes):
            code = code.decode('utf-8')
        print(code, "CODE")

        pipeline = PiiNERPipeline(
            model,
            tokenizer=tokenizer,
            batch_size=1024,
            window_size=512,
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
            num_workers=1,
            id_to_label=model.config.id2label,
            window_overlap=False,
            bf16=True
        )

        # Split after ". " unless the next char is lowercase or end-of-string.
        # Raw string fixes the invalid '\.' escape (SyntaxWarning on 3.12+).
        sentences = re.split(r'(?<=\. )(?!$|[a-z])', code)
        print(sentences, "############SENTENCES#########")

        dataset = Dataset.from_dict(
            {"content": sentences, "id": list(range(len(sentences)))}
        )
        replacements = get_replacements()

        results = list(pipeline(dataset))
        print(results, "RESULT")

        examples = {
            "content": [res["content"] for res in results],
            "entities": [res["entities"] for res in results],
        }

        redacted_results = redact_pii_batch(examples, replacements)
        print(redacted_results, "redacted_code_parts")

        redacted_code_parts = redacted_results["new_content"]
        print(redacted_code_parts, "redacted_code_parts")
        return redacted_code_parts

    @staticmethod
    def codeFile(code, filename, model, tokenizer):
        """Redact PII in *code* and write the result next to *filename*.

        The output path is ``<root>_redacted<ext>`` derived from *filename*.
        Writing is best-effort: an I/O failure is reported but the redacted
        content is still returned.  (Previously ``content`` was assigned
        inside the ``try``, so a failed ``open()`` made the final ``return``
        raise ``NameError`` on the unbound local.)

        Args:
            code: ``str`` or UTF-8 ``bytes`` to redact.
            filename: original file name used to derive the output path.
            model: HF token-classification model.
            tokenizer: matching HF tokenizer.

        Returns:
            Tuple of ``(redacted_content, output_path)``.
        """
        # Build the redacted content up front so it is always defined,
        # even if the write below fails.
        content = ''.join(codeNer._redact_parts(code, model, tokenizer))

        root, ext = os.path.splitext(filename)
        output_code_file = root + "_redacted" + ext
        try:
            # Input was decoded as UTF-8, so write UTF-8 explicitly instead
            # of relying on the platform-default encoding.
            with open(output_code_file, "w", encoding="utf-8") as file:
                file.write(content)
            print(content, "content")
        except OSError as e:
            # Narrowed from bare `Exception`: only file-system errors are
            # expected here, and the original intent was print-and-continue.
            print(f"An error occurred while writing to the file: {e}")

        return content, output_code_file

    @staticmethod
    def codeText(code, model, tokenizer):
        """Redact PII in *code* and return the redacted text.

        Args:
            code: ``str`` or UTF-8 ``bytes`` to redact.
            model: HF token-classification model.
            tokenizer: matching HF tokenizer.

        Returns:
            The redacted text as a single string.
        """
        redacted_code = ''.join(codeNer._redact_parts(code, model, tokenizer))
        print(redacted_code, "redacted_code")
        return redacted_code
|
|
|
|
|
|