Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,105 +2,94 @@ import os
|
|
2 |
import json
|
3 |
import logging
|
4 |
import nltk
|
5 |
-
from nltk import word_tokenize, pos_tag
|
6 |
from tqdm import tqdm
|
7 |
-
import
|
8 |
-
from transformers import
|
9 |
-
from datasets import Dataset
|
10 |
-
from huggingface_hub import HfApi
|
11 |
-
import shutil
|
12 |
-
import torch
|
13 |
|
14 |
-
# Setup
|
15 |
-
|
16 |
-
|
17 |
-
logger = logging.getLogger(__name__)
|
18 |
|
19 |
# Download NLTK data
|
20 |
nltk.download('punkt')
|
21 |
nltk.download('averaged_perceptron_tagger')
|
|
|
22 |
|
23 |
-
#
|
24 |
-
|
25 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
26 |
-
model = AutoModelForCausalLM.from_pretrained(
|
27 |
-
model_name,
|
28 |
-
trust_remote_code=True,
|
29 |
-
torch_dtype=torch.float32, # CPU compatibility
|
30 |
-
device_map="cpu"
|
31 |
-
)
|
32 |
|
33 |
-
#
|
34 |
-
|
35 |
-
|
|
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
return
|
49 |
|
50 |
-
#
|
51 |
-
def
|
52 |
-
|
53 |
-
|
54 |
-
for entry in tqdm(dataset, desc="Writing JSONL"):
|
55 |
-
f.write(json.dumps(entry) + '\n')
|
56 |
|
57 |
-
# Push to HuggingFace
|
58 |
-
def push_to_hf(dataset_path):
|
59 |
-
logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
|
60 |
api = HfApi()
|
61 |
-
|
62 |
-
dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
|
63 |
-
logger.info("Dataset pushed successfully")
|
64 |
|
65 |
-
#
|
66 |
-
|
67 |
-
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cpu")
|
68 |
-
outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
|
69 |
-
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
70 |
|
71 |
-
#
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
output_file = os.path.join(converted_dir, "output.jsonl")
|
77 |
-
convert_to_jsonl(dataset, output_file)
|
78 |
-
push_to_hf(output_file)
|
79 |
-
return json.dumps(dataset, indent=2)
|
80 |
|
81 |
-
#
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
text_input = st.text_area("Input Text", height=200)
|
97 |
-
if st.button("Convert & Push"):
|
98 |
-
with st.spinner("Processing..."):
|
99 |
-
result = convert_text(text_input)
|
100 |
-
st.text_area("JSON Output", value=result, height=200)
|
101 |
|
|
|
102 |
if __name__ == "__main__":
|
103 |
-
|
104 |
-
|
105 |
-
# Cleanup (optional)
|
106 |
-
shutil.rmtree(converted_dir, ignore_errors=True)
|
|
|
2 |
import json
|
3 |
import logging
|
4 |
import nltk
|
|
|
5 |
from tqdm import tqdm
|
6 |
+
import gradio as gr
|
7 |
+
from transformers import pipeline
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
# Setup logging: timestamped INFO-level records on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Download the NLTK data used by text_to_jsonl (sentence/word tokenizer,
# POS tagger, WordNet).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# NLTK 3.9+ looks up the tokenizer and tagger models under new package names;
# without them sent_tokenize / pos_tag raise LookupError at request time.
# On older NLTK releases these ids are simply unknown to the index and
# nltk.download reports the failure without raising.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Initialize DeepSeek AI pipeline for long text processing
# NOTE(review): nothing in the visible code uses deepseek_pipeline, yet the
# model is loaded at import time, so a missing or oversized model aborts
# startup — confirm it is still needed before keeping it here.
deepseek_pipeline = pipeline('text-generation', model='DeepSeekAI/DeepFocus-X3')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
# Function to process text and convert to JSONL
|
22 |
+
def text_to_jsonl(text):
    """Build one JSON-serializable record per sentence of *text*.

    The text is split with ``nltk.sent_tokenize``; each sentence is tokenized
    with ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``.

    Each record is a dict with the keys:
      * ``"tokenizer"`` - the ``(word, tag)`` pairs returned by ``nltk.pos_tag``
      * ``"words"``     - the tokens of the sentence
      * ``"meaning"``   - for every token, the definition of its first WordNet
        synset, or ``'No definition'`` when WordNet returns no synsets

    Returns:
        The list of records in sentence order (empty when the tokenizer
        yields no sentences).
    """
    wordnet = nltk.corpus.wordnet
    # token -> definition; each distinct token hits WordNet exactly once
    # instead of twice per occurrence.
    definitions = {}

    def define(word):
        if word not in definitions:
            synsets = wordnet.synsets(word)
            definitions[word] = synsets[0].definition() if synsets else 'No definition'
        return definitions[word]

    jsonl_data = []
    for sentence in tqdm(nltk.sent_tokenize(text), desc="Processing sentences"):
        words = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(words)
        jsonl_data.append({
            "tokenizer": pos_tags,
            "words": words,
            "meaning": [define(word) for word in words],
        })

    return jsonl_data
|
38 |
|
39 |
+
# Function to push data to Hugging Face dataset repository
|
40 |
+
def push_to_huggingface(jsonl_data, repo_name='katsukiai/DeepFocus-X3'):
    """Upload *jsonl_data* as ``data.jsonl`` to a Hugging Face dataset repo.

    Args:
        jsonl_data: Iterable of JSON-serializable items; each one becomes a
            single line of the uploaded file.
        repo_name: Id of the dataset repository. It is created as a public
            dataset repo if it does not exist yet (``exist_ok=True``).

    Any error raised by ``json.dumps`` or by the Hub client propagates to
    the caller.
    """
    from huggingface_hub import HfApi

    api = HfApi()
    repo_id = str(repo_name)

    # Create or get the dataset repository
    api.create_repo(repo_id, repo_type="dataset", private=False, exist_ok=True)

    # Serialize in memory and upload the bytes directly: no scratch file is
    # left in the working directory, and simultaneous requests cannot
    # clobber each other's half-written "data.jsonl".
    payload = "".join(json.dumps(item) + "\n" for item in jsonl_data).encode("utf-8")

    # Upload the file to the repository
    api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="data.jsonl",
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info(f"Data pushed to {repo_id}")
|
64 |
|
65 |
+
# Gradio interface
|
66 |
+
def generate_jsonl(text):
    """Gradio callback: convert *text* to JSONL records and push them.

    Args:
        text: Raw text from the input textbox.

    Returns:
        A status string for the output textbox.

    Empty or whitespace-only input is rejected up front so that an empty
    ``data.jsonl`` is never uploaded. Errors raised by text_to_jsonl or
    push_to_huggingface propagate to the caller.
    """
    if not text or not text.strip():
        return "No input text provided; nothing was pushed"

    jsonl_data = text_to_jsonl(text)
    push_to_huggingface(jsonl_data)
    return "Data processed and pushed to Hugging Face"
|
70 |
|
71 |
+
# Define Gradio interface
|
72 |
+
def gradio_interface():
    """Assemble the two-tab Gradio UI and start serving it.

    The "About" tab shows a short description; the "Generate" tab holds an
    input textbox, a status textbox and a button that runs generate_jsonl
    on the input and writes its return value to the status box.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Text to JSONL Converter and Hugging Face Pusher")

        with gr.Tab("About"):
            gr.Markdown("""
            ## About
            This tool converts text into JSONL format with detailed information about each word, including its tokenizer and meaning.
            It then pushes the processed data to a Hugging Face dataset repository.
            """)

        with gr.Tab("Generate"):
            # Input and status boxes share one row; the button sits below it.
            with gr.Row():
                source_box = gr.Textbox(label="Input Text", lines=5)
                status_box = gr.Textbox(label="Output Status", lines=1)

            push_button = gr.Button("Generate and Push")
            push_button.click(
                fn=generate_jsonl,
                inputs=source_box,
                outputs=status_box,
            )

    demo.launch()


# Run the Gradio interface
if __name__ == "__main__":
    gradio_interface()
|
|
|
|
|
|