RAGOndevice

Running on Zero

File size: 26,981 Bytes

3db2af2
e6c14df
 
3db2af2
 
e6c14df
 
 
55caecd
e6c14df
 
 
 
1cfe513
e6c14df
1be852d
e6c14df
3e45a0e
 
e6c14df
3db2af2
55caecd
e6c14df
 
9e9b867
1c47184
9e9b867
3c893d2
1c47184
9e9b867
58e272a
50ef49c
0223744
 
 
3c893d2
 
 
 
 
 
 
55caecd
 
3c893d2
e6c14df
1be852d
 
 
 
 
 
 
 
 
 
 
 
 
 
3db2af2
 
 
3c893d2
2317674
0223744
9e9b867
0cdbe8f
9e9b867
 
e6c14df
9a66aa0
9e9b867
e6c14df
58e272a
 
9e9b867
55caecd
50ef49c
55caecd
50ef49c
 
 
9e9b867
e6c14df
0223744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6c14df
58e272a
50ef49c
 
58e272a
e6c14df
58e272a
 
0223744
50ef49c
 
 
 
 
58e272a
9e9b867
fcd720a
3db2af2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6c14df
3db2af2
 
 
 
 
 
 
 
 
 
 
e6c14df
0223744
 
 
 
 
 
 
fcd720a
0223744
fcd720a
3db2af2
0223744
 
 
3db2af2
0223744
 
 
 
fcd720a
3db2af2
0223744
 
fcd720a
6360699
6adfca3
 
 
e6c14df
55caecd
 
 
 
6adfca3
 
3db2af2
6adfca3
3c893d2
 
 
3db2af2
fcd720a
 
 
 
e6c14df
 
3db2af2
fcd720a
3c893d2
 
3db2af2
fcd720a
3c893d2
3db2af2
fcd720a
3c893d2
 
e6c14df
 
3db2af2
3c893d2
 
fcd720a
3c893d2
 
3db2af2
3c893d2
 
fcd720a
3db2af2
55caecd
3c893d2
3db2af2
 
 
 
 
 
 
 
 
 
55caecd
3c893d2
 
fcd720a
3db2af2
6adfca3
 
 
 
 
fcd720a
 
 
 
e6c14df
 
3db2af2
fcd720a
3c893d2
 
3db2af2
fcd720a
3c893d2
3db2af2
fcd720a
6adfca3
3c893d2
e6c14df
 
3db2af2
6adfca3
 
 
e6c14df
 
 
3db2af2
3c893d2
6adfca3
 
 
 
 
3db2af2
3c893d2
 
 
e6c14df
 
 
 
3db2af2
55caecd
3c893d2
e6c14df
 
 
 
 
 
fcd720a
 
 
 
 
3c893d2
 
 
fcd720a
 
 
 
 
3db2af2
3c893d2
e6c14df
6adfca3
 
e6c14df
 
 
 
3db2af2
6adfca3
fcd720a
6adfca3
e6c14df
b03b509
55caecd
b03b509
 
3c893d2
 
 
 
 
e6c14df
3c893d2
 
 
9affa6d
3db2af2
3c893d2
 
 
9affa6d
3c893d2
55caecd
 
3db2af2
3c893d2
 
 
 
3db2af2
 
 
 
 
 
 
 
 
 
e6c14df
e468070
3db2af2
 
 
 
 
 
 
 
 
 
3c893d2
3db2af2
e468070
3c893d2
 
3db2af2
e6c14df
 
3c893d2
55caecd
e468070
 
3db2af2
e468070
 
 
 
3db2af2
 
 
 
58d9d19
e468070
 
e6c14df
e468070
3db2af2
3c893d2
 
55caecd
9affa6d
e468070
 
e6c14df
9affa6d
 
 
 
 
 
 
e468070
e6c14df
 
55caecd
e6c14df
 
 
3db2af2
e468070
 
 
 
 
 
 
55caecd
9affa6d
 
 
 
 
 
 
e6c14df
e468070
 
55caecd
3db2af2
 
9affa6d
 
e6c14df
55caecd
9affa6d
e6c14df
9affa6d
58d9d19
9affa6d
 
 
 
 
 
e6c14df
 
9affa6d
e6c14df
55caecd
9affa6d
3db2af2
e6c14df
3db2af2
 
55caecd
3db2af2
4dd6e62
3db2af2
e6c14df
55caecd
e468070
3db2af2
e468070
 
 
 
3db2af2
 
e468070
55caecd
4dd6e62
55caecd
e468070
3db2af2
3c893d2
3db2af2
55caecd
e468070
 
 
55caecd
e468070
fca6281
 
e6c14df
58d9d19
 
e6c14df
 
 
55caecd
e6c14df
 
 
 
 
 
55caecd
fca6281
 
e6c14df
55caecd
fca6281
e6c14df
fca6281
e6c14df
55caecd
e6c14df
e468070
e6c14df
9affa6d
 
e6c14df
3c893d2
 
e468070
1be852d
 
 
e6c14df
3c893d2
e468070
 
e6c14df
7773cb1
 
3c893d2
 
5f4c99e
d6a3ccb
 
3c893d2
3db2af2
c53bcba
 
 
 
 
 
3db2af2
c53bcba
 
 
 
 
 
 
 
 
 
3db2af2
3c893d2
c53bcba
 
d6a3ccb
c53bcba
 
 
 
3db2af2
c53bcba
 
d6a3ccb
c53bcba
 
 
3db2af2
3c893d2
 
 
 
 
 
3db2af2
e6c14df
d6a3ccb
c53bcba
 
 
 
d6a3ccb
c53bcba
 
 
d6a3ccb
0b0ac38
c53bcba
 
 
d6a3ccb
0b0ac38
c53bcba
 
d6a3ccb
b03b509
c53bcba
 
d6a3ccb
c53bcba
 
55caecd
c53bcba
 
d6a3ccb
 
 
 
c53bcba
 
 
 
e6c14df
3c893d2
 
 
d6a3ccb
3c893d2
55caecd
c53bcba
 
 
 
 
 
 
 
 
 
e468070
e6c14df
e468070
9affa6d
 
 
 
e468070
 
 
 
 
 
 
 
 
 
e6c14df
3c893d2
 
 
 
 
 
7773cb1
2317674
 
7773cb1
e6c14df

import os

# 1) Dynamo 완전 비활성화
os.environ["TORCH_DYNAMO_DISABLE"] = "1"

# 2) Triton의 cudagraphs 최적화 비활성화
os.environ["TRITON_DISABLE_CUDAGRAPHS"] = "1"

# (옵션) 경고 무시 설정
import warnings
warnings.filterwarnings("ignore", message="skipping cudagraphs due to mutated inputs")
warnings.filterwarnings("ignore", message="Not enough SMs to use max_autotune_gemm mode")

import torch
# TensorFloat32 연산 활성화 (성능 최적화)
torch.set_float32_matmul_precision('high')

import torch._inductor
torch._inductor.config.triton.cudagraphs = False

import torch._dynamo
# suppress_errors (오류 시 eager로 fallback)
torch._dynamo.config.suppress_errors = True

import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

from threading import Thread
from datasets import load_dataset
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import json
from datetime import datetime
import pyarrow.parquet as pq
import pypdf
import io
import platform
import subprocess
import pytesseract
from pdf2image import convert_from_path
import queue
import time

# -------------------- Imports for PDF-to-Markdown conversion --------------------
try:
    import re
    import requests
    from bs4 import BeautifulSoup
    import urllib.request
    import ocrmypdf
    import pytz
    import urllib.parse
    from pypdf import PdfReader
except ModuleNotFoundError as e:
    # Report the module that actually failed to import. Any of the imports
    # above can be the culprit, so always blaming beautifulsoup4 sent users
    # chasing the wrong package. The original error is chained for context.
    raise ModuleNotFoundError(
        f"필수 모듈 '{e.name}'이(가) 누락되었습니다. 해당 패키지를 설치해주세요.\n"
        "(참고: 'bs4' 모듈은 'beautifulsoup4' 패키지로 설치합니다. "
        "예: pip install beautifulsoup4)",
        name=e.name,
    ) from e
# ---------------------------------------------------------------------------

# Global state
# Context text of the most recently analyzed upload. stream_chat writes it and
# clear_conversation resets it to None.
current_file_context = None

# Environment / model configuration
# NOTE(review): HF_TOKEN and MODEL_NAME are not referenced anywhere else in the
# visible code - confirm whether they are still needed.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL_ID = "CohereForAI/c4ai-command-r7b-12-2024"
MODEL_NAME = MODEL_ID.split("/")[-1]

model = None  # Managed globally; loaded on first use inside stream_chat
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# (1) Load the Wikipedia Q&A dataset (runs at import time)
wiki_dataset = load_dataset("lcw99/wikipedia-korean-20240501-1million-qna")
print("Wikipedia dataset loaded:", wiki_dataset)

# (2) Initialize and fit the TF-IDF vectorizer (only a subset is used)
print("TF-IDF 벡터화 시작...")
# Only the first 10,000 questions are indexed, so find_relevant_context can
# only ever return rows from that prefix of the train split.
questions = wiki_dataset['train']['question'][:10000]
vectorizer = TfidfVectorizer(max_features=1000)
question_vectors = vectorizer.fit_transform(questions)
print("TF-IDF 벡터화 완료")

# ------------------------- ChatHistory class -------------------------
class ChatHistory:
    """Conversation log that is mirrored to a JSON file on every change.

    Each entry is a dict with an ISO ``timestamp`` and a two-element
    ``messages`` list (user turn first, assistant turn second).
    """

    def __init__(self):
        self.history = []
        self.history_file = "/tmp/chat_history.json"
        self.load_history()

    def add_conversation(self, user_msg: str, assistant_msg: str):
        """Append one user/assistant exchange and persist the log."""
        user_turn = {"role": "user", "content": user_msg}
        assistant_turn = {"role": "assistant", "content": assistant_msg}
        self.history.append({
            "timestamp": datetime.now().isoformat(),
            "messages": [user_turn, assistant_turn],
        })
        self.save_history()

    def format_for_display(self):
        """Return the log as ``[user_text, assistant_text]`` pairs."""
        return [
            [entry["messages"][0]["content"], entry["messages"][1]["content"]]
            for entry in self.history
        ]

    def get_messages_for_api(self):
        """Return the log flattened into role/content message dicts."""
        flattened = []
        for entry in self.history:
            user_turn, assistant_turn = entry["messages"][0], entry["messages"][1]
            flattened.append({"role": "user", "content": user_turn["content"]})
            flattened.append({"role": "assistant", "content": assistant_turn["content"]})
        return flattened

    def clear_history(self):
        """Drop every stored exchange and persist the now-empty log."""
        self.history = []
        self.save_history()

    def save_history(self):
        """Write the log to ``history_file``; failures are printed, not raised."""
        try:
            with open(self.history_file, 'w', encoding='utf-8') as sink:
                json.dump(self.history, sink, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"히스토리 저장 실패: {e}")

    def load_history(self):
        """Read the log from ``history_file`` if it exists.

        Any failure is printed and leaves the in-memory log empty.
        """
        try:
            if not os.path.exists(self.history_file):
                return
            with open(self.history_file, 'r', encoding='utf-8') as source:
                self.history = json.load(source)
        except Exception as e:
            print(f"히스토리 로드 실패: {e}")
            self.history = []

chat_history = ChatHistory()

# ------------------------- Wiki document search (TF-IDF) -------------------------
def find_relevant_context(query, top_k=3):
    """Return up to ``top_k`` indexed Q&A rows most similar to ``query``.

    Similarity is the product of the query's TF-IDF vector with the fitted
    ``question_vectors`` matrix. Rows whose similarity is not positive are
    dropped, so fewer than ``top_k`` results (or none) may be returned.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results; non-positive values yield ``[]``.

    Returns:
        A list of dicts with ``question``, ``answer`` and ``similarity`` keys,
        ordered from most to least similar.
    """
    if top_k <= 0:
        # similarities[-0:] would select every row instead of none.
        return []

    query_vector = vectorizer.transform([query])
    similarities = (query_vector * question_vectors.T).toarray()[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    train_split = wiki_dataset['train']
    relevant_contexts = []
    for idx in top_indices:
        score = similarities[idx]
        if score > 0:
            row = int(idx)  # plain int: safe for both list and dataset indexing
            relevant_contexts.append({
                'question': questions[row],
                # Fetch the single row rather than materializing the whole
                # 'answer' column for every hit.
                'answer': train_split[row]['answer'],
                'similarity': score
            })
    return relevant_contexts

def init_msg():
    """Return the placeholder text written into the message box after an upload.

    stream_chat compares the incoming message against this exact string to
    decide whether it should analyze the uploaded file.
    """
    placeholder = "파일을 분석하고 있습니다..."
    return placeholder

# -------------------- Utility functions for converting PDF files to Markdown --------------------
def extract_text_from_pdf(reader: PdfReader) -> str:
    """Concatenate the text of every page that yields any text.

    Each non-empty page is prefixed with a ``---- Page N ----`` marker
    (1-based) and followed by a blank line. Surrounding whitespace of the
    combined result is stripped.
    """
    sections = []
    for page_number, page in enumerate(reader.pages, start=1):
        page_text = page.extract_text() or ""
        if not page_text:
            continue  # pages with no extractable text get no marker at all
        sections.append(f"---- Page {page_number} ----\n" + page_text + "\n\n")
    return "".join(sections).strip()

def convert_pdf_to_markdown(pdf_file: str):
    """Extract text and metadata from a PDF, running OCR when text is scarce.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        ``(text, metadata, pdf_file)`` on success, where ``metadata`` is a dict
        with ``author``, ``creator``, ``producer``, ``subject`` and ``title``
        keys (values are ``None`` when the PDF carries no metadata).
        ``(error_message, None, None)`` when the PDF cannot be opened.
    """
    try:
        reader = PdfReader(pdf_file)
    except Exception as e:
        return f"PDF 파일을 읽는 중 오류 발생: {e}", None, None

    raw_meta = reader.metadata
    metadata = {
        field: (getattr(raw_meta, field) if raw_meta else None)
        for field in ("author", "creator", "producer", "subject", "title")
    }

    full_text = extract_text_from_pdf(reader)

    # Pages that contain images but little extractable text are treated as
    # scanned documents and sent through OCR.
    image_count = sum(len(page.images) for page in reader.pages)
    if image_count > 0 and len(full_text) < 1000:
        try:
            # Derive the OCR output path from the real extension. A plain
            # str.replace(".pdf", ...) left ".PDF" paths unchanged (so OCR
            # overwrote the input) and rewrote every ".pdf" in the path.
            base_path, _ = os.path.splitext(pdf_file)
            out_pdf_file = base_path + "_ocr.pdf"
            ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
            reader_ocr = PdfReader(out_pdf_file)
            full_text = extract_text_from_pdf(reader_ocr)
        except Exception as e:
            # Best effort: keep whatever text was extracted without OCR.
            full_text = f"OCR 처리 중 오류 발생: {e}\n\n원본 PDF 텍스트:\n\n" + full_text

    return full_text, metadata, pdf_file

# ------------------------- File analysis functions -------------------------
def _dataset_dimensions(lines):
    """Return ``(columns, rows)`` for a dataset report, or ``None`` if unknown."""
    # Preferred source: the explicit totals written by read_uploaded_file.
    totals = {}
    for line in lines:
        for label in ("Total Rows", "Total Columns"):
            prefix = f"- {label}: "
            if label not in totals and line.startswith(prefix):
                totals[label] = int(line[len(prefix):].replace(",", ""))
    if len(totals) == 2:
        return totals["Total Columns"], totals["Total Rows"]

    # Fallback: the first contiguous pipe-delimited (markdown) table.
    table = []
    for line in lines:
        if line.lstrip().startswith('|'):
            table.append(line)
        elif table:
            break
    if len(table) < 2:
        return None
    # A table is a header row, a separator row, then the data rows.
    return table[0].count('|') - 1, len(table) - 2


def analyze_file_content(content, file_type):
    """Return a one-line structural summary of ``content``.

    Args:
        content: Text produced for the uploaded file.
        file_type: Type tag; ``'parquet'`` and ``'csv'`` select the dataset
            summary, anything else is classified as code or prose.

    Returns:
        A summary string. For datasets whose dimensions cannot be determined
        the failure message is returned instead of raising.
    """
    if file_type in ['parquet', 'csv']:
        # The first line of a dataset report is its title, not a table header,
        # so the dimensions are looked up rather than read from lines[0].
        try:
            dimensions = _dataset_dimensions(content.split('\n'))
        except (AttributeError, ValueError):
            dimensions = None
        if dimensions is None:
            return "❌ Failed to analyze dataset structure"
        columns, rows = dimensions
        return f"📊 Dataset Structure: {columns} columns, {rows} rows"

    lines = content.split('\n')
    total_lines = len(lines)

    # Keyword sniffing: any of these substrings marks the content as code.
    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"

    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words"

# Encodings tried, in order, when decoding CSV and plain-text uploads.
_UPLOAD_ENCODINGS = ['utf-8', 'cp949', 'euc-kr', 'latin1']


def _dataframe_overview(df, title):
    """Build the title, basic-information and column-information sections."""
    mem_usage = df.memory_usage(deep=True).sum() / 1024 / 1024
    parts = [
        f"{title}\n\n",
        "1. Basic Information:\n",
        f"- Total Rows: {len(df):,}\n",
        f"- Total Columns: {len(df.columns)}\n",
        f"- Memory Usage: {mem_usage:.2f} MB\n\n",
        "2. Column Information:\n",
    ]
    parts.extend(f"- {col} ({df[col].dtype})\n" for col in df.columns)
    return "".join(parts)


def _missing_values_section(df):
    """Build the missing-values section (columns without nulls are omitted)."""
    parts = ["\n\n4. Missing Values:\n"]
    null_counts = df.isnull().sum()
    for col, count in null_counts[null_counts > 0].items():
        rate = count / len(df) * 100
        parts.append(f"- {col}: {count:,} ({rate:.1f}%)\n")
    return "".join(parts)


def _read_parquet_file(path):
    """Summarize a Parquet file; returns ``(report, "parquet")`` or an error pair."""
    try:
        # Imported here so a missing optional dependency only affects
        # Parquet uploads and is reported like any other read failure.
        import pyarrow.parquet as pq
        from tabulate import tabulate

        df = pq.read_table(path).to_pandas()

        content = _dataframe_overview(df, "📊 Parquet File Analysis:")
        content += "\n3. Data Preview:\n"
        content += tabulate(df.head(5), headers='keys', tablefmt='pipe', showindex=False)
        content += _missing_values_section(df)

        numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            content += "\n5. Numeric Column Statistics:\n"
            stats_df = df[numeric_cols].describe()
            content += tabulate(stats_df, headers='keys', tablefmt='pipe')

        return content, "parquet"
    except Exception as e:
        return f"Error reading Parquet file: {str(e)}", "error"


def _read_pdf_file(path):
    """Convert a PDF to a Markdown report; returns ``(report, "pdf")`` or an error pair."""
    try:
        markdown_text, metadata, _ = convert_pdf_to_markdown(path)
        if metadata is None:
            # convert_pdf_to_markdown signals failure with metadata=None and
            # puts its error message in the text slot.
            return f"PDF 파일 변환 오류 또는 읽기 실패.\n\n원본 메시지:\n{markdown_text}", "error"

        content = "# PDF to Markdown Conversion\n\n"
        content += "## Metadata\n"
        for k, v in metadata.items():
            content += f"**{k.capitalize()}**: {v}\n\n"
        content += "## Extracted Text\n\n"
        content += markdown_text

        return content, "pdf"
    except Exception as e:
        return f"Error reading PDF file: {str(e)}", "error"


def _read_csv_file(path):
    """Summarize a CSV file using the first encoding that decodes it."""
    import pandas as pd

    for encoding in _UPLOAD_ENCODINGS:
        try:
            df = pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            continue

        content = _dataframe_overview(df, "📊 CSV File Analysis:")
        content += "\n3. Data Preview:\n"
        content += df.head(5).to_markdown(index=False)
        content += _missing_values_section(df)
        return content, "csv"

    # UnicodeDecodeError needs five constructor arguments, so raising it with
    # a single message string would itself fail with a TypeError.
    raise ValueError(
        f"Unable to read file with supported encodings ({', '.join(_UPLOAD_ENCODINGS)})"
    )


def _read_text_file(path):
    """Read a text/code file and append a short statistics footer."""
    for encoding in _UPLOAD_ENCODINGS:
        try:
            with open(path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            continue

        lines = content.split('\n')
        total_lines = len(lines)
        non_empty_lines = len([line for line in lines if line.strip()])
        is_code = any(
            keyword in content.lower()
            for keyword in ['def ', 'class ', 'import ', 'function']
        )

        analysis = "\n📝 File Analysis:\n"
        if is_code:
            functions = sum('def ' in line for line in lines)
            classes = sum('class ' in line for line in lines)
            imports = sum(
                ('import ' in line) or ('from ' in line)
                for line in lines
            )
            analysis += "- File Type: Code\n"
            analysis += f"- Total Lines: {total_lines:,}\n"
            analysis += f"- Functions: {functions}\n"
            analysis += f"- Classes: {classes}\n"
            analysis += f"- Import Statements: {imports}\n"
        else:
            words = len(content.split())
            chars = len(content)
            analysis += "- File Type: Text\n"
            analysis += f"- Total Lines: {total_lines:,}\n"
            analysis += f"- Non-empty Lines: {non_empty_lines:,}\n"
            analysis += f"- Word Count: {words:,}\n"
            analysis += f"- Character Count: {chars:,}\n"

        return content + analysis, "text"

    raise ValueError(
        f"Unable to read file with supported encodings ({', '.join(_UPLOAD_ENCODINGS)})"
    )


def read_uploaded_file(file):
    """Read an uploaded file and return ``(content, file_type)``.

    Args:
        file: ``None``, a filesystem path (``str`` / ``os.PathLike``), or an
            object exposing the path as ``.name``.

    Returns:
        ``("", "")`` when ``file`` is ``None``. Otherwise a report string and
        one of ``"parquet"``, ``"pdf"``, ``"csv"``, ``"text"``, or ``"error"``
        (in which case the string is the error message). Dispatch is by
        lower-cased file extension; unknown extensions are read as text.
    """
    if file is None:
        return "", ""

    try:
        # Accept a plain path as well as an object carrying the path in .name;
        # calling .name on a str path raised AttributeError.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        file_ext = os.path.splitext(path)[1].lower()

        if file_ext == '.parquet':
            return _read_parquet_file(path)
        if file_ext == '.pdf':
            return _read_pdf_file(path)
        if file_ext == '.csv':
            return _read_csv_file(path)
        return _read_text_file(path)

    except Exception as e:
        return f"Error reading file: {str(e)}", "error"

# ------------------------- CSS -------------------------
# Stylesheet string passed to gr.Blocks(css=...) in create_demo. The literal
# below holds only a placeholder CSS comment; the actual rules are elided.
CSS = """
/* (생략: 동일) */
"""

def clear_cuda_memory():
    """Release PyTorch's cached CUDA memory; a no-op when CUDA is unavailable.

    ``torch.cuda.empty_cache`` always exists as an attribute, so checking for
    it never prevented the CUDA device context from being entered on hosts
    without a GPU. Availability is checked instead.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device('cuda'):
        torch.cuda.empty_cache()

# ------------------------- Model loading function -------------------------
@spaces.GPU
def load_model():
    """Load the causal language model identified by ``MODEL_ID``.

    CUDA's cache is cleared first. The model is loaded in bfloat16 with
    automatic device placement, and caching is switched off in its config.

    Returns:
        The loaded model.

    Raises:
        Exception: Any loading error is printed and then re-raised unchanged.
    """
    load_options = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "low_cpu_mem_usage": True,
    }
    try:
        clear_cuda_memory()
        causal_lm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)
        # (Important) cache usage is also turned off in the model's own config
        causal_lm.config.use_cache = False
    except Exception as e:
        print(f"모델 로드 오류: {str(e)}")
        raise
    return causal_lm

def build_prompt(conversation: list) -> str:
    """Render role/content messages as a plain-text transcript.

    User turns become ``User: ...`` lines and assistant turns become
    ``Assistant: ...`` lines; messages with any other role are skipped. The
    result always ends with an open ``Assistant: `` cue for the model.
    """
    pieces = []
    for turn in conversation:
        role = turn["role"]
        if role == "user":
            pieces.append("User: " + turn["content"] + "\n")
        elif role == "assistant":
            pieces.append("Assistant: " + turn["content"] + "\n")
    pieces.append("Assistant: ")
    return "".join(pieces)

# ------------------------- Message streaming function -------------------------
@spaces.GPU
def stream_chat(
    message: str,
    history: list,
    uploaded_file,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    top_k: int,
    penalty: float
):
    """Stream a model reply to ``message`` for the Gradio chat UI.

    The prompt is assembled from the (trimmed) chat history, the context of
    an uploaded file if there is one, and TF-IDF matches from the wiki
    dataset. Generation runs in a background thread and partial text is
    yielded as it arrives.

    Args:
        message: Current user message. The placeholder produced by
            ``init_msg`` triggers analysis of ``uploaded_file``.
        history: Prior ``[user, assistant]`` pairs shown in the chatbot.
        uploaded_file: Uploaded file (or ``None``).
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        penalty: Repetition penalty; non-positive values disable it.

    Yields:
        ``("", updated_history)`` tuples: the cleared textbox value and the
        chatbot contents. Errors are reported as a chat message, not raised.
    """
    global model, current_file_context

    try:
        if model is None:
            model = load_model()

        print(f'[User input] message: {message}')
        print(f'[History] {history}')

        # UI sliders may deliver ints where floats are expected and vice
        # versa; the generation arguments are type-checked, so normalize once.
        temperature = float(temperature)
        top_p = float(top_p)
        top_k = int(top_k)
        max_new_tokens = int(max_new_tokens)
        penalty = float(penalty)

        # 1) Handle a freshly uploaded file
        file_context = ""
        if uploaded_file and message == "파일을 분석하고 있습니다...":
            current_file_context = None
            try:
                content, file_type = read_uploaded_file(uploaded_file)
                if content:
                    file_analysis = analyze_file_content(content, file_type)
                    file_context = (
                        f"\n\n📄 파일 분석 결과:\n{file_analysis}"
                        f"\n\n파일 내용:\n```\n{content}\n```"
                    )
                    current_file_context = file_context
                    message = "업로드된 파일을 분석해주세요."
            except Exception as e:
                print(f"[파일 분석 오류] {str(e)}")
                file_context = f"\n\n❌ 파일 분석 중 오류가 발생했습니다: {str(e)}"
        elif current_file_context:
            # Reuse the context of the previously analyzed file
            file_context = current_file_context

        # 2) Wiki context
        wiki_context = ""
        try:
            relevant_contexts = find_relevant_context(message)
            if relevant_contexts:
                wiki_context = "\n\n관련 위키피디아 정보:\n"
                for ctx in relevant_contexts:
                    wiki_context += (
                        f"Q: {ctx['question']}\n"
                        f"A: {ctx['answer']}\n"
                        f"유사도: {ctx['similarity']:.3f}\n\n"
                    )
        except Exception as e:
            print(f"[컨텍스트 검색 오류] {str(e)}")

        # 3) Trim the conversation history
        max_history_length = 10
        if len(history) > max_history_length:
            history = history[-max_history_length:]

        conversation = []
        for prompt, answer in history:
            conversation.extend([
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": answer}
            ])

        # 4) Final message: whichever contexts exist, then the question
        context_prefix = file_context + wiki_context
        if context_prefix:
            final_message = context_prefix + "\n현재 질문: " + message
        else:
            final_message = message

        conversation.append({"role": "user", "content": final_message})

        # 5) Tokenize
        input_ids_str = build_prompt(conversation)
        max_context = 8192
        tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
        input_length = tokenized_input["input_ids"].shape[1]

        # 6) Truncate when the context window would be exceeded
        if input_length > max_context - max_new_tokens:
            print(f"[경고] 입력이 너무 깁니다: {input_length} 토큰 -> 잘라냄.")
            min_generation = min(256, max_new_tokens)
            new_desired_input_length = max_context - min_generation
            tokens = tokenizer.encode(input_ids_str)
            if len(tokens) > new_desired_input_length:
                # Keep the tail so the most recent turns survive
                tokens = tokens[-new_desired_input_length:]
                input_ids_str = tokenizer.decode(tokens)
            tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
            input_length = tokenized_input["input_ids"].shape[1]

        print(f"[토큰 길이] {input_length}")
        inputs = tokenized_input.to("cuda")

        # 7) Cap max_new_tokens by the remaining context budget
        remaining = max_context - input_length
        if remaining < max_new_tokens:
            print(f"[max_new_tokens 조정] {max_new_tokens} -> {remaining}")
            max_new_tokens = max(1, remaining)

        # 8) Configure the TextIteratorStreamer
        streamer = TextIteratorStreamer(
            tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
        )

        # Sampling requires a strictly positive temperature, but the UI
        # slider allows 0. Treat 0 as a request for greedy decoding instead
        # of letting generation fail inside the worker thread.
        if temperature > 0:
            sampling_kwargs = dict(
                do_sample=True,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
            )
        else:
            sampling_kwargs = dict(do_sample=False)

        # The repetition penalty must be positive; 1.0 means "no penalty".
        repetition_penalty = penalty if penalty > 0 else 1.0

        # use_cache=False is set deliberately here (flagged as important by
        # the original author)
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            repetition_penalty=repetition_penalty,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=False,
            **sampling_kwargs,
        )

        clear_cuda_memory()

        # 9) Run the model in a separate thread
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        # 10) Stream
        buffer = ""
        partial_message = ""
        last_yield_time = time.time()

        try:
            for new_text in streamer:
                buffer += new_text
                partial_message += new_text

                # Update the UI on a timer or once enough text has accumulated
                current_time = time.time()
                if (current_time - last_yield_time > 0.1) or (len(partial_message) > 20):
                    yield "", history + [[message, buffer]]
                    partial_message = ""
                    last_yield_time = current_time

            # Final output
            if buffer:
                yield "", history + [[message, buffer]]

            # Persist the exchange
            chat_history.add_conversation(message, buffer)

        except Exception as e:
            print(f"[스트리밍 중 오류] {str(e)}")
            if not buffer:
                buffer = f"응답 생성 중 오류 발생: {str(e)}"
            yield "", history + [[message, buffer]]

        if thread.is_alive():
            thread.join(timeout=5.0)

        clear_cuda_memory()

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        error_message = f"오류가 발생했습니다: {str(e)}\n{error_details}"
        print(f"[Stream chat 오류] {error_message}")
        clear_cuda_memory()
        yield "", history + [[message, error_message]]

# ------------------------- Gradio UI construction -------------------------
def create_demo():
    """Build and return the Gradio Blocks application.

    The layout consists of a chatbot, a row with file upload / message box /
    Send / Clear controls, an accordion of generation settings, and example
    prompts. Message submission and file uploads are both routed to
    ``stream_chat``.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(css=CSS) as demo:
        with gr.Column(elem_classes="markdown-style"):
            gr.Markdown("""
                # 🤖 RAGOndevice
                #### 📊 RAG: Upload and Analyze Files (TXT, CSV, PDF, Parquet files)
                Upload your files for data analysis and learning
            """)

        chatbot = gr.Chatbot(
            value=[],
            height=600,
            label="GiniGEN AI Assistant",
            elem_classes="chat-container"
        )

        with gr.Row(elem_classes="input-container"):
            with gr.Column(scale=1, min_width=70):
                file_upload = gr.File(
                    type="filepath",
                    elem_classes="file-upload-icon",
                    scale=1,
                    container=True,
                    interactive=True,
                    show_label=False
                )

            with gr.Column(scale=3):
                msg = gr.Textbox(
                    show_label=False,
                    placeholder="Type your message here... 💭",
                    container=False,
                    elem_classes="input-textbox",
                    scale=1
                )

            with gr.Column(scale=1, min_width=70):
                send = gr.Button(
                    "Send",
                    elem_classes="send-button custom-button",
                    scale=1
                )

            with gr.Column(scale=1, min_width=70):
                clear = gr.Button(
                    "Clear",
                    elem_classes="clear-button custom-button",
                    scale=1
                )

        # Advanced settings
        with gr.Accordion("🎮 Advanced Settings", open=False):
            with gr.Row():
                with gr.Column(scale=1):
                    temperature = gr.Slider(
                        minimum=0, maximum=1, step=0.1, value=0.8,
                        label="Creativity Level 🎨"
                    )
                    max_new_tokens = gr.Slider(
                        minimum=128, maximum=8000, step=1, value=4000,
                        label="Maximum Token Count 📝"
                    )
                with gr.Column(scale=1):
                    top_p = gr.Slider(
                        minimum=0.0, maximum=1.0, step=0.1, value=0.8,
                        label="Diversity Control 🎯"
                    )
                    top_k = gr.Slider(
                        minimum=1, maximum=20, step=1, value=20,
                        label="Selection Range 📊"
                    )
                    penalty = gr.Slider(
                        minimum=0.0, maximum=2.0, step=0.1, value=1.0,
                        label="Repetition Penalty 🔄"
                    )

        # Example inputs
        gr.Examples(
            examples=[
                ["Please analyze this code and suggest improvements:\ndef fibonacci(n):\n    if n <= 1: return n\n    return fibonacci(n-1) + fibonacci(n-2)"],
                ["Please analyze this data and provide insights:\nAnnual Revenue (Million)\n2019: 1200\n2020: 980\n2021: 1450\n2022: 2100\n2023: 1890"],
                ["Please solve this math problem step by step: 'When a circle's area is twice that of its inscribed square, find the relationship between the circle's radius and the square's side length.'"],
                ["Please analyze this marketing campaign's ROI and suggest improvements:\nTotal Cost: $50,000\nReach: 1M users\nClick Rate: 2.3%\nConversion Rate: 0.8%\nAverage Purchase: $35"],
            ],
            inputs=msg
        )

        # Reset the conversation and the remembered file context
        def clear_conversation():
            global current_file_context
            current_file_context = None
            return [], None, "Start a new conversation..."

        # Message submission (Enter key and Send button)
        msg.submit(
            stream_chat,
            inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
            outputs=[msg, chatbot]
        )
        send.click(
            stream_chat,
            inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
            outputs=[msg, chatbot]
        )

        # File upload event.
        # Bound to `upload` rather than `change`: `change` also fires when the
        # value is reset programmatically (the Clear button sets the file to
        # None) or when the user removes the file, which re-ran the analysis
        # chain with no file right after the chat had been cleared.
        file_upload.upload(
            fn=lambda: ("처리 중...", [["시스템", "파일을 분석 중입니다. 잠시만 기다려주세요..."]]),
            outputs=[msg, chatbot],
            queue=False
        ).then(
            fn=init_msg,
            outputs=msg,
            queue=False
        ).then(
            fn=stream_chat,
            inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty],
            outputs=[msg, chatbot],
            queue=True
        )

        # Clear button
        clear.click(
            fn=clear_conversation,
            outputs=[chatbot, file_upload, msg],
            queue=False
        )

        return demo

# Script entry point: build the Blocks UI and start the Gradio server.
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()