Spaces:

universeofml
/

DeepFocusTrain

Runtime error

App Files Community

katsukiai committed on Mar 5

Commit

ba51acd

verified ·

1 Parent(s): ad7dde5

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -82

app.py CHANGED Viewed

@@ -2,105 +2,94 @@ import os
 import json
 import logging
 import nltk
-from nltk import word_tokenize, pos_tag
 from tqdm import tqdm
-import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from datasets import Dataset
-from huggingface_hub import HfApi
-import shutil
-import torch
-# Setup environment and logging
-os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN", "your_hf_token_here")
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-logger = logging.getLogger(__name__)
 # Download NLTK data
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
-# Load DeepSeek-V3 model and tokenizer
-model_name = "deepseek-ai/DeepSeek-V3"  # Updated to V3
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    trust_remote_code=True,
-    torch_dtype=torch.float32,  # CPU compatibility
-    device_map="cpu"
-)
-# Paths
-converted_dir = "converted/"
-os.makedirs(converted_dir, exist_ok=True)
-# Training dataset preparation
-def prepare_dataset(text_data):
-    logger.info("Preparing dataset...")
-    dataset = []
-    for text in tqdm(text_data.split('\n'), desc="Tokenizing"):
-        if text.strip():
-            tokens = word_tokenize(text)
-            tagged = pos_tag(tokens)
-            words = [word for word, _ in tagged]
-            means = [tag for _, tag in tagged]
-            dataset.append({"tokenizer": tokens, "words": words, "meaning": means})
-    return dataset
-# Convert to JSONL
-def convert_to_jsonl(dataset, output_file):
-    logger.info(f"Converting to JSONL: {output_file}")
-    with open(output_file, 'w') as f:
-        for entry in tqdm(dataset, desc="Writing JSONL"):
-            f.write(json.dumps(entry) + '\n')
-# Push to HuggingFace
-def push_to_hf(dataset_path):
-    logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
     api = HfApi()
-    dataset = Dataset.from_json(dataset_path)
-    dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
-    logger.info("Dataset pushed successfully")
-# Generate text using DeepSeek-V3
-def generate_text(input_text):
-    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cpu")
-    outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-# Streamlit conversion function
-def convert_text(text):
-    logger.info("Processing text with Streamlit...")
-    long_text = generate_text(text) if len(text) > 100 else text
-    dataset = prepare_dataset(long_text)
-    output_file = os.path.join(converted_dir, "output.jsonl")
-    convert_to_jsonl(dataset, output_file)
-    push_to_hf(output_file)
-    return json.dumps(dataset, indent=2)
-# Streamlit Interface
-def main():
-    st.title("Text to JSON Converter")
-    # Tabs using Streamlit expander
-    tab = st.sidebar.selectbox("Select Tab", ["About", "Generate all"])
-    if tab == "About":
-        st.markdown("""
-        This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-V3 for long text generation.
-        The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
-        Format: {"tokenizer": tokens, "words": words, "meaning": means}
-        """)
-    elif tab == "Generate all":
-        text_input = st.text_area("Input Text", height=200)
-        if st.button("Convert & Push"):
-            with st.spinner("Processing..."):
-                result = convert_text(text_input)
-                st.text_area("JSON Output", value=result, height=200)
 if __name__ == "__main__":
-    main()
-# Cleanup (optional)
-shutil.rmtree(converted_dir, ignore_errors=True)

 import json
 import logging
 import nltk
 from tqdm import tqdm
+import gradio as gr
+from transformers import pipeline
+# Setup logging: configure the root logger at INFO level with
+# "timestamp - level - message" records, and keep a handle to the root logger.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger()
 # Download NLTK data
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet')
+# Build a Hugging Face text-generation pipeline once, at import time.
+# NOTE(review): nothing in the visible code references deepseek_pipeline after
+# this line, so the model load appears unused here - confirm before keeping it.
+# NOTE(review): confirm the model id 'DeepSeekAI/DeepFocus-X3' resolves on the Hub;
+# if it does not, this call would raise while the module is still importing.
+deepseek_pipeline = pipeline('text-generation', model='DeepSeekAI/DeepFocus-X3')
+# Function to process text and convert to JSONL
+def text_to_jsonl(text):
+    """Convert raw text into a list of per-sentence, JSON-serializable records.
+
+    The text is split into sentences with NLTK; each sentence is word-tokenized,
+    POS-tagged, and every token is looked up in WordNet.
+
+    Args:
+        text: The input text to process.
+
+    Returns:
+        A list with one dict per sentence, containing:
+        "tokenizer": the (word, POS tag) pairs returned by nltk.pos_tag,
+        "words": the tokens of the sentence,
+        "meaning": for each token, the definition of its first WordNet
+        synset, or 'No definition' when WordNet returns no synsets.
+    """
+    wordnet = nltk.corpus.wordnet
+    jsonl_data = []
+    for sentence in tqdm(nltk.sent_tokenize(text), desc="Processing sentences"):
+        words = nltk.word_tokenize(sentence)
+        pos_tags = nltk.pos_tag(words)
+        meanings = []
+        for word in words:
+            # Query WordNet once per token; the same result serves both the
+            # emptiness check and the definition lookup.
+            synsets = wordnet.synsets(word)
+            meanings.append(synsets[0].definition() if synsets else 'No definition')
+        jsonl_data.append({
+            "tokenizer": pos_tags,
+            "words": words,
+            "meaning": meanings
+        })
+    return jsonl_data
+# Function to push data to Hugging Face dataset repository
+def push_to_huggingface(jsonl_data, repo_name='katsukiai/DeepFocus-X3'):
+    """Write records to a local JSONL file and upload it to a Hub dataset repo.
+
+    Args:
+        jsonl_data: Iterable of JSON-serializable records, one per output line.
+        repo_name: Id of the target dataset repository on the Hugging Face Hub.
+
+    Side effects:
+        Creates or overwrites "data.jsonl" in the current working directory,
+        ensures the dataset repository exists, and uploads the file to it as
+        "data.jsonl".
+    """
+    # Imported locally; only HfApi is needed here.
+    from huggingface_hub import HfApi
     api = HfApi()
+    repo_id = repo_name
+    # Create the dataset repository; exist_ok=True tolerates one that already exists.
+    api.create_repo(repo_id, repo_type="dataset", private=False, exist_ok=True)
+    # Write one JSON object per line.
+    jsonl_file_path = "data.jsonl"
+    with open(jsonl_file_path, "w", encoding="utf-8") as f:
+        f.writelines(json.dumps(item) + "\n" for item in jsonl_data)
+    # Upload the file to the repository
+    api.upload_file(
+        path_or_fileobj=jsonl_file_path,
+        path_in_repo="data.jsonl",
+        repo_id=repo_id,
+        repo_type="dataset"
+    )
+    logger.info("Data pushed to %s", repo_id)
+# Gradio interface
+def generate_jsonl(text):
+    """Gradio callback: convert the submitted text and push the result to the Hub.
+
+    Args:
+        text: Contents of the input textbox.
+
+    Returns:
+        A status message for the output textbox.
+    """
+    # A blank submission carries no words; skip processing instead of
+    # pushing a content-free file to the dataset repository.
+    if not text or not text.strip():
+        return "No input text provided; nothing was pushed"
+    jsonl_data = text_to_jsonl(text)
+    push_to_huggingface(jsonl_data)
+    return "Data processed and pushed to Hugging Face"
+# Define Gradio interface
+def gradio_interface():
+    """Assemble the two-tab Gradio Blocks UI and launch it.
+
+    The "About" tab shows a short description; the "Generate" tab wires an
+    input textbox and a button to generate_jsonl and shows the returned
+    status in an output textbox.
+    """
+    about_text = """
+                ## About
+                This tool converts text into JSONL format with detailed information about each word, including its tokenizer and meaning.
+                It then pushes the processed data to a Hugging Face dataset repository.
+            """
+    with gr.Blocks() as app:
+        gr.Markdown("# Text to JSONL Converter and Hugging Face Pusher")
+        with gr.Tab("About"):
+            gr.Markdown(about_text)
+        with gr.Tab("Generate"):
+            with gr.Row():
+                source_box = gr.Textbox(label="Input Text", lines=5)
+                status_box = gr.Textbox(label="Output Status", lines=1)
+            push_button = gr.Button("Generate and Push")
+            # Clicking runs the conversion and shows the returned status message.
+            push_button.click(fn=generate_jsonl, inputs=source_box, outputs=status_box)
+    app.launch()
+# Build and launch the Gradio interface only when this file is run as a script,
+# not when it is imported as a module.
 if __name__ == "__main__":
+    gradio_interface()