katsukiai committed on
Commit
ba51acd
·
verified ·
1 Parent(s): ad7dde5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -82
app.py CHANGED
@@ -2,105 +2,94 @@ import os
2
  import json
3
  import logging
4
  import nltk
5
- from nltk import word_tokenize, pos_tag
6
  from tqdm import tqdm
7
- import streamlit as st
8
- from transformers import AutoTokenizer, AutoModelForCausalLM
9
- from datasets import Dataset
10
- from huggingface_hub import HfApi
11
- import shutil
12
- import torch
13
 
14
- # Setup environment and logging
15
- os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN", "your_hf_token_here")
16
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
17
- logger = logging.getLogger(__name__)
18
 
19
  # Download NLTK data
20
  nltk.download('punkt')
21
  nltk.download('averaged_perceptron_tagger')
 
22
 
23
- # Load DeepSeek-V3 model and tokenizer
24
- model_name = "deepseek-ai/DeepSeek-V3" # Updated to V3
25
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
26
- model = AutoModelForCausalLM.from_pretrained(
27
- model_name,
28
- trust_remote_code=True,
29
- torch_dtype=torch.float32, # CPU compatibility
30
- device_map="cpu"
31
- )
32
 
33
- # Paths
34
- converted_dir = "converted/"
35
- os.makedirs(converted_dir, exist_ok=True)
 
36
 
37
- # Training dataset preparation
38
- def prepare_dataset(text_data):
39
- logger.info("Preparing dataset...")
40
- dataset = []
41
- for text in tqdm(text_data.split('\n'), desc="Tokenizing"):
42
- if text.strip():
43
- tokens = word_tokenize(text)
44
- tagged = pos_tag(tokens)
45
- words = [word for word, _ in tagged]
46
- means = [tag for _, tag in tagged]
47
- dataset.append({"tokenizer": tokens, "words": words, "meaning": means})
48
- return dataset
49
 
50
- # Convert to JSONL
51
- def convert_to_jsonl(dataset, output_file):
52
- logger.info(f"Converting to JSONL: {output_file}")
53
- with open(output_file, 'w') as f:
54
- for entry in tqdm(dataset, desc="Writing JSONL"):
55
- f.write(json.dumps(entry) + '\n')
56
 
57
- # Push to HuggingFace
58
- def push_to_hf(dataset_path):
59
- logger.info("Pushing to HuggingFace dataset: katsukiai/DeepFocus-X3")
60
  api = HfApi()
61
- dataset = Dataset.from_json(dataset_path)
62
- dataset.push_to_hub("katsukiai/DeepFocus-X3", token=os.environ["HF_TOKEN"])
63
- logger.info("Dataset pushed successfully")
64
 
65
- # Generate text using DeepSeek-V3
66
- def generate_text(input_text):
67
- inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to("cpu")
68
- outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)
69
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
70
 
71
- # Streamlit conversion function
72
- def convert_text(text):
73
- logger.info("Processing text with Streamlit...")
74
- long_text = generate_text(text) if len(text) > 100 else text
75
- dataset = prepare_dataset(long_text)
76
- output_file = os.path.join(converted_dir, "output.jsonl")
77
- convert_to_jsonl(dataset, output_file)
78
- push_to_hf(output_file)
79
- return json.dumps(dataset, indent=2)
80
 
81
- # Streamlit Interface
82
- def main():
83
- st.title("Text to JSON Converter")
 
 
 
 
 
84
 
85
- # Tabs using Streamlit expander
86
- tab = st.sidebar.selectbox("Select Tab", ["About", "Generate all"])
 
 
 
87
 
88
- if tab == "About":
89
- st.markdown("""
90
- This tool converts text to JSONL format using NLTK for tokenization and DeepSeek-V3 for long text generation.
91
- The output is saved in 'converted/' folder and pushed to HuggingFace dataset 'katsukiai/DeepFocus-X3'.
92
- Format: {"tokenizer": tokens, "words": words, "meaning": means}
93
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- elif tab == "Generate all":
96
- text_input = st.text_area("Input Text", height=200)
97
- if st.button("Convert & Push"):
98
- with st.spinner("Processing..."):
99
- result = convert_text(text_input)
100
- st.text_area("JSON Output", value=result, height=200)
101
 
 
102
  if __name__ == "__main__":
103
- main()
104
-
105
- # Cleanup (optional)
106
- shutil.rmtree(converted_dir, ignore_errors=True)
 
2
  import json
3
  import logging
4
  import nltk
 
5
  from tqdm import tqdm
6
+ import gradio as gr
7
+ from transformers import pipeline
 
 
 
 
8
 
9
# Setup logging: timestamped INFO-level records on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Download NLTK data used by sentence/word tokenization, POS tagging and
# WordNet lookups.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# NOTE(review): recent NLTK releases look these resources up under new names;
# requesting both spellings keeps old and new installs working. An unknown
# resource name is only reported by the downloader, it does not raise —
# confirm against the pinned NLTK version.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Initialize DeepSeek AI pipeline for long text processing.
# The pipeline is not referenced anywhere else in the visible code, so a
# failure to fetch/load the model should not prevent the UI from starting.
# NOTE(review): 'DeepSeekAI/DeepFocus-X3' differs from the dataset repo id
# 'katsukiai/DeepFocus-X3' used below — verify the model id exists.
try:
    deepseek_pipeline = pipeline('text-generation', model='DeepSeekAI/DeepFocus-X3')
except (OSError, ValueError, RuntimeError) as exc:
    logger.warning("Text-generation pipeline unavailable: %s", exc)
    deepseek_pipeline = None
 
 
 
 
 
 
 
20
 
21
+ # Function to process text and convert to JSONL
22
def text_to_jsonl(text):
    """Split *text* into sentences and build one record per sentence.

    Each record is a dict with three keys:
      * "tokenizer": the (token, POS tag) pairs from ``nltk.pos_tag``
      * "words":     the tokens from ``nltk.word_tokenize``
      * "meaning":   per token, the definition of its first WordNet synset,
                     or 'No definition' when WordNet has no entry

    Args:
        text: Raw input text. ``None``, empty and whitespace-only input
            yield an empty result without touching NLTK.

    Returns:
        A list of record dicts, in sentence order.
    """
    # Nothing to tokenize; also protects against a None value from the UI.
    if not text or not text.strip():
        return []

    wordnet = nltk.corpus.wordnet
    jsonl_data = []

    for sentence in tqdm(nltk.sent_tokenize(text), desc="Processing sentences"):
        words = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(words)

        meanings = []
        for word in words:
            # Query WordNet once per token instead of once for the
            # existence check and again for the definition.
            synsets = wordnet.synsets(word)
            meanings.append(synsets[0].definition() if synsets else 'No definition')

        jsonl_data.append({
            "tokenizer": pos_tags,
            "words": words,
            "meaning": meanings
        })

    return jsonl_data
38
 
39
+ # Function to push data to Hugging Face dataset repository
40
def push_to_huggingface(jsonl_data, repo_name='katsukiai/DeepFocus-X3'):
    """Write *jsonl_data* to ``data.jsonl`` and upload it to a dataset repo.

    Args:
        jsonl_data: Iterable of JSON-serializable records; each one becomes
            a single line of the output file.
        repo_name: Hugging Face dataset repository id to upload into. The
            repository is created (public) if it does not exist yet.

    Side effects:
        Creates/overwrites ``data.jsonl`` in the current working directory
        and uploads it as ``data.jsonl`` in the repository.
    """
    # Imported locally, as in the original, because the module does not
    # import huggingface_hub at file level.
    from huggingface_hub import HfApi

    # NOTE(review): no token is passed explicitly; presumably the client
    # relies on ambient credentials — confirm for the deployment target.
    api = HfApi()
    repo_id = repo_name

    # Create or get the dataset repository
    api.create_repo(repo_id, repo_type="dataset", private=False, exist_ok=True)

    # Write JSONL data to a file. The explicit encoding keeps the output
    # independent of the platform's default locale.
    jsonl_file_path = "data.jsonl"
    with open(jsonl_file_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in jsonl_data)

    # Upload the file to the repository (the file is closed at this point).
    api.upload_file(
        path_or_fileobj=jsonl_file_path,
        path_in_repo="data.jsonl",
        repo_id=repo_id,
        repo_type="dataset"
    )
    logger.info("Data pushed to %s", repo_id)
64
 
65
+ # Gradio interface
66
def generate_jsonl(text):
    """Gradio click handler: convert *text* to JSONL records and push them.

    Args:
        text: Contents of the input textbox.

    Returns:
        A one-line status message for the output textbox.
    """
    # A blank submission would otherwise upload an empty data.jsonl and
    # overwrite whatever the dataset repository already holds.
    if not text or not text.strip():
        return "No input text provided; nothing was pushed"

    jsonl_data = text_to_jsonl(text)
    push_to_huggingface(jsonl_data)
    return "Data processed and pushed to Hugging Face"
70
 
71
+ # Define Gradio interface
72
def gradio_interface():
    """Assemble the two-tab Gradio Blocks UI and launch it.

    The "About" tab shows a static description; the "Generate" tab wires an
    input textbox and a status textbox to ``generate_jsonl`` via a button.
    """
    with gr.Blocks() as app:
        gr.Markdown("# Text to JSONL Converter and Hugging Face Pusher")

        # Static description of the tool.
        with gr.Tab("About"):
            gr.Markdown("""
            ## About
            This tool converts text into JSONL format with detailed information about each word, including its tokenizer and meaning.
            It then pushes the processed data to a Hugging Face dataset repository.
            """)

        # Input on the left, push status on the right, trigger below.
        with gr.Tab("Generate"):
            with gr.Row():
                source_box = gr.Textbox(label="Input Text", lines=5)
                status_box = gr.Textbox(label="Output Status", lines=1)

            push_button = gr.Button("Generate and Push")
            push_button.click(
                fn=generate_jsonl,
                inputs=source_box,
                outputs=status_box,
            )

    app.launch()


# Run the Gradio interface
if __name__ == "__main__":
    gradio_interface()