katsukiai committed (verified)
Commit fb2776a · 1 Parent(s): cdb8f5c

Update app.py

Files changed (1): app.py +131 -75
app.py CHANGED
@@ -1,94 +1,150 @@
 
  import os
  import csv
  import logging
  import gradio as gr
- import nltk
- from datasets import Dataset, DatasetDict, DatasetInfo, Features, Value, ClassLabel
- from huggingface_hub import HfApi, Repository, create_repo
  from tqdm import tqdm
  from nltk.tokenize import word_tokenize
- from nltk.corpus import wordnet as wn
- import random
- import string

- # Ensure necessary NLTK resources are downloaded
  nltk.download('all')
- #nltk.download('wordnet')

- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)

- # Function to generate random words
- def generate_random_words(num_words=100):
-     words = []
-     for _ in range(num_words):
-         word_length = random.randint(3, 10)
-         word = ''.join(random.choices(string.ascii_lowercase, k=word_length))
-         words.append(word)
-     return words

- # Function to get meanings of words using NLTK WordNet
- def get_word_meanings(words):
-     meanings = {}
-     for word in words:
-         synsets = wn.synsets(word)
-         if synsets:
-             meanings[word] = synsets[0].definition()
-         else:
-             meanings[word] = "No definition found."
-     return meanings

- # Function to convert data to CSV format
- def convert_to_csv(data, filename='dataset.csv'):
-     fieldnames = ['word', 'meaning']
-     with open(filename, mode='w', newline='', encoding='utf-8') as file:
-         writer = csv.DictWriter(file, fieldnames=fieldnames)
-         writer.writeheader()
-         for word, meaning in data.items():
-             writer.writerow({'word': word, 'meaning': meaning})

- # Function to create and push dataset to Hugging Face
- def create_and_push_dataset(csv_file='dataset.csv', repo_name='DeepFocus-X3'):
-     # Create a new dataset repository on Hugging Face
-     create_repo(repo_name, exist_ok=True)
-     api = HfApi()
-     api.upload_file(
-         path_or_fileobj=csv_file,
-         path_in_repo=csv_file,
-         repo_id=repo_name,
-         repo_type='dataset'
-     )
-     logger.info(f"Dataset {repo_name} created and file {csv_file} uploaded.")

- # Gradio interface functions
- def generate_words_interface():
-     num_words = random.randint(50, 200)
-     words = generate_random_words(num_words)
-     meanings = get_word_meanings(words)
-     convert_to_csv(meanings)
-     return f"Generated {num_words} random words and saved to dataset.csv."

- def about_interface():
-     return "This is a dataset generation tool that creates a dataset of random words and their meanings, then uploads it to Hugging Face."

- def logs_interface():
-     with open('dataset_generation.log', 'r') as file:
-         logs = file.read()
-     return logs

- # Gradio app setup
- with gr.Blocks() as demo:
      with gr.Tabs():
-         with gr.Tab("About"):
-             about_text = gr.Markdown(about_interface)
-         with gr.Tab("Generate"):
-             generate_button = gr.Button("Generate Dataset")
-             generate_output = gr.Textbox()
-             generate_button.click(generate_words_interface, outputs=generate_output)
-         with gr.Tab("Logs"):
-             logs_output = gr.Textbox(value=logs_interface(), interactive=False)

- # Run the Gradio app
- if __name__ == "__main__":
-     demo.launch()
 
+
  import os
  import csv
+ import json
  import logging
  import gradio as gr
  from tqdm import tqdm
+ import nltk
  from nltk.tokenize import word_tokenize
+ from nltk.corpus import wordnet
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from huggingface_hub import HfApi, Repository, login
+ from datasets import Dataset
+ import pandas as pd
+ from datetime import datetime

+ # Download all NLTK data
  nltk.download('all')

+ # Setup logging
+ log_dir = "logs"
+ os.makedirs(log_dir, exist_ok=True)
+ logging.basicConfig(
+     filename=os.path.join(log_dir, f"app_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ # Error logging to Hugging Face
+ error_dir = "errors"
+ os.makedirs(error_dir, exist_ok=True)
+ error_log_file = os.path.join(error_dir, f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
+
+ def log_error(error_msg):
+     with open(error_log_file, 'a') as f:
+         f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - ERROR - {error_msg}\n")
+     try:
+         api = HfApi()
+         api.upload_file(
+             path_or_fileobj=error_log_file,
+             path_in_repo=f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
+             repo_id="katsukiai/errors",
+             repo_type="dataset"
+         )
+     except Exception as e:
+         logging.error(f"Failed to upload error log: {str(e)}")

+ # Load Hugging Face models (DeepSeek for long-text processing, FLAN-T5 for meaning generation)
+ # Note: DeepSeek-Coder is a decoder-only model, so it must be loaded as a causal LM
+ tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")
+ model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")
+ meaning_generator = pipeline("text2text-generation", model="google/flan-t5-large")

+ # Hugging Face login (expects a valid token in the HF_TOKEN environment variable)
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ if HF_TOKEN:
+     login(token=HF_TOKEN)

+ # Dataset preparation
+ dataset_dir = "dataset"
+ os.makedirs(dataset_dir, exist_ok=True)
+ csv_file = os.path.join(dataset_dir, "deepfocus_data.csv")

+ def process_text_to_csv(input_text):
+     try:
+         tokens = word_tokenize(input_text.lower())
+         words = list(set(tokens))
+         data = []
+         for word in tqdm(words, desc="Processing words"):
+             meanings = []
+             synsets = wordnet.synsets(word)
+             if synsets:
+                 # Keep up to three WordNet definitions per word
+                 meanings = [syn.definition() for syn in synsets[:3]]
+             else:
+                 # Fall back to the FLAN-T5 pipeline when WordNet has no entry
+                 try:
+                     generated_meaning = meaning_generator(f"Define the word '{word}'", max_length=100)[0]['generated_text']
+                     meanings.append(generated_meaning)
+                 except Exception as e:
+                     log_error(f"Meaning generation failed for '{word}': {str(e)}")
+             # Each row repeats the full token list under the "tokenizer" field
+             data.append({"tokenizer": tokens, "words": word, "meaning": meanings})
+
+         # Save to CSV
+         with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+             writer = csv.DictWriter(f, fieldnames=["tokenizer", "words", "meaning"])
+             writer.writeheader()
+             writer.writerows(data)
+
+         logging.info(f"Dataset saved to {csv_file}")
+         return data
+     except Exception as e:
+         log_error(f"Error in process_text_to_csv: {str(e)}")
+         raise

+ def upload_to_huggingface():
+     try:
+         dataset = Dataset.from_csv(csv_file)
+         dataset.push_to_hub("katsukiai/DeepFocus-X3", token=HF_TOKEN)
+         logging.info("Dataset uploaded to Hugging Face")
+     except Exception as e:
+         log_error(f"Error uploading to Hugging Face: {str(e)}")
+         raise
+
+ def generate_output(input_text):
+     try:
+         data = process_text_to_csv(input_text)
+         upload_to_huggingface()
+         return json.dumps(data, indent=2)
+     except Exception as e:
+         log_error(f"Error in generate_output: {str(e)}")
+         return f"Error: {str(e)}"
+
+ def view_logs():
+     try:
+         log_files = os.listdir(log_dir)
+         log_content = ""
+         for log_file in log_files:
+             with open(os.path.join(log_dir, log_file), 'r') as f:
+                 log_content += f"\n\n--- {log_file} ---\n\n{f.read()}"
+         return log_content
+     except Exception as e:
+         log_error(f"Error in view_logs: {str(e)}")
+         return f"Error: {str(e)}"

+ # Gradio Interface
+ with gr.Blocks(title="DeepFocus-X3") as demo:
+     gr.Markdown("# DeepFocus-X3")
+
      with gr.Tabs():
+         with gr.TabItem("About"):
+             gr.Markdown("""
+             ## About DeepFocus-X3
+             This application processes text, tokenizes it, extracts unique words, generates meanings, and uploads the dataset to Hugging Face.
+             - Uses NLTK for tokenization and WordNet for meanings.
+             - Leverages DeepSeek AI for long-text processing and Google FLAN-T5 for meaning generation.
+             - Logs all activities and errors, with error logs uploaded to Hugging Face.
+             """)
+
+         with gr.TabItem("Generate all"):
+             input_text = gr.Textbox(label="Input Text", lines=10)
+             output_json = gr.Textbox(label="Output JSON", lines=10)
+             generate_btn = gr.Button("Generate and Upload")
+             generate_btn.click(fn=generate_output, inputs=input_text, outputs=output_json)
+
+         with gr.TabItem("Logs"):
+             gr.Markdown("## Report using Logs")
+             log_output = gr.Textbox(label="Log Content", lines=20)
+             view_logs_btn = gr.Button("View Logs")
+             view_logs_btn.click(fn=view_logs, inputs=None, outputs=log_output)
+
+ # Launch Gradio app
+ demo.launch()
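
For reference, below is a minimal standalone sketch of the core path the new app.py implements (tokenize, deduplicate, look up WordNet definitions), runnable without the DeepSeek/FLAN-T5 model downloads, the Gradio UI, or a Hub token. The lookup_meanings helper and the sample sentence are illustrative additions, not part of the commit.

# Minimal smoke test of the tokenize -> dedupe -> WordNet-lookup path
# used by process_text_to_csv. Assumes only nltk is installed; the
# helper name lookup_meanings and the sample text are hypothetical.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet', quiet=True)

def lookup_meanings(text):
    # Lowercase, tokenize, deduplicate, then fetch up to three definitions per word
    tokens = word_tokenize(text.lower())
    return {
        word: [syn.definition() for syn in wordnet.synsets(word)[:3]]
        for word in sorted(set(tokens))
    }

if __name__ == "__main__":
    for word, meanings in lookup_meanings("Focus improves recall").items():
        print(word, "->", meanings or ["no WordNet entry"])

Words that come back with an empty list here are exactly the cases where the committed code falls back to the FLAN-T5 meaning_generator pipeline.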