Parquet-CSV-Convertor

Running

App Files Files Community

openfree committed 6 days ago

Commit

5ffc072

verified ·

1 Parent(s): fa41b98

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -54

app.py CHANGED Viewed

@@ -1,26 +1,19 @@
 import gradio as gr
 import pandas as pd
-import chardet
 from io import BytesIO
-def detect_encoding(file_bytes):
-    """Detect the encoding of the file."""
-    # Use chardet to detect encoding
-    result = chardet.detect(file_bytes)
-    return result['encoding']
 def convert_file(input_file, conversion_type):
-    # Check if a file was uploaded
     if input_file is None:
-        return None, "Please upload a file."
-    # Read the file content
     try:
-        # Try reading from file-like object
         file_bytes = input_file.read()
         file_name = input_file.name
     except AttributeError:
-        # If there's an AttributeError, treat input_file as a file path
         file_name = input_file
         with open(file_name, "rb") as f:
             file_bytes = f.read()
@@ -31,62 +24,61 @@ def convert_file(input_file, conversion_type):
     converted_format = None
     try:
-        # Conversion: CSV to Parquet
         if conversion_type == "CSV to Parquet":
             if file_extension != "csv":
-                return None, "For CSV to Parquet conversion, please upload a CSV file."
-            # Detect the encoding of the CSV file
-            encoding = detect_encoding(file_bytes)
-            # Try to read with detected encoding
-            try:
-                df = pd.read_csv(BytesIO(file_bytes), encoding=encoding)
-            except Exception as e:
-                # If that fails, try with other common encodings
-                for enc in ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']:
-                    try:
-                        df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
-                        encoding = enc
-                        break
-                    except:
-                        continue
-                if df is None:
-                    return None, f"Failed to read CSV with any encoding. Error: {str(e)}"
             output_file = "output.parquet"
             df.to_parquet(output_file, index=False)
             converted_format = "Parquet"
-        # Conversion: Parquet to CSV
         elif conversion_type == "Parquet to CSV":
             if file_extension != "parquet":
-                return None, "For Parquet to CSV conversion, please upload a Parquet file."
             df = pd.read_parquet(BytesIO(file_bytes))
             output_file = "output.csv"
             df.to_csv(output_file, index=False, encoding='utf-8')
             converted_format = "CSV"
         else:
-            return None, "Invalid conversion type selected."
-        # Generate a preview of the top 10 rows
         preview = df.head(10).to_string(index=False)
         info_message = (
-            f"Input file: {file_name}\n"
-            f"Converted file format: {converted_format}\n"
         )
-        if conversion_type == "CSV to Parquet":
-            info_message += f"Detected encoding: {encoding}\n"
-        info_message += f"\nPreview (Top 10 Rows):\n{preview}"
         return output_file, info_message
     except Exception as e:
-        return None, f"Error during conversion: {str(e)}"
-# Custom CSS for a modern and sleek look
 custom_css = """
 body {
     background-color: #f4f4f4;
@@ -120,33 +112,33 @@ h1, h2 {
 }
 """
-with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
-    gr.Markdown("# CSV <-> Parquet Converter")
-    gr.Markdown("Upload a CSV or Parquet file and select the conversion type. The app converts the file to the opposite format and displays a preview of the top 10 rows.")
     with gr.Row():
         with gr.Column(scale=1):
-            input_file = gr.File(label="Upload CSV or Parquet File")
         with gr.Column(scale=1):
             conversion_type = gr.Radio(
                 choices=["CSV to Parquet", "Parquet to CSV"],
-                label="Conversion Type",
-                value="CSV to Parquet"  # Set default value
             )
-    convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
     with gr.Row():
-        output_file = gr.File(label="Converted File")
-        preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
     convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
     gr.Markdown("""
-    ### Notes:
-    - This converter can handle various CSV encodings
-    - Parquet files are always encoded in UTF-8
-    - The preview shows only the first 10 rows of data
     """)
 if __name__ == "__main__":

 import gradio as gr
 import pandas as pd
 from io import BytesIO
 def convert_file(input_file, conversion_type):
+    # 파일이 업로드되었는지 확인
     if input_file is None:
+        return None, "파일을 업로드해 주세요."
+    # 파일 내용 읽기
     try:
+        # 파일 객체에서 읽기 시도
         file_bytes = input_file.read()
         file_name = input_file.name
     except AttributeError:
+        # AttributeError가 발생하면 input_file을 파일 경로로 처리
         file_name = input_file
         with open(file_name, "rb") as f:
             file_bytes = f.read()
     converted_format = None
     try:
+        # 변환: CSV에서 Parquet으로
         if conversion_type == "CSV to Parquet":
             if file_extension != "csv":
+                return None, "CSV에서 Parquet으로 변환하려면 CSV 파일을 업로드해 주세요."
+            # 다양한 인코딩을 시도 (chardet 없이)
+            encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
+            encoding = None
+            for enc in encodings_to_try:
+                try:
+                    df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
+                    encoding = enc
+                    break
+                except UnicodeDecodeError:
+                    continue
+                except Exception as e:
+                    return None, f"CSV 읽기 오류: {str(e)}"
+            if df is None:
+                return None, "일반적인 인코딩으로 CSV를 읽지 못했습니다. 파일이 특이한 인코딩을 사용할 수 있습니다."
             output_file = "output.parquet"
             df.to_parquet(output_file, index=False)
             converted_format = "Parquet"
+        # 변환: Parquet에서 CSV로
         elif conversion_type == "Parquet to CSV":
             if file_extension != "parquet":
+                return None, "Parquet에서 CSV로 변환하려면 Parquet 파일을 업로드해 주세요."
             df = pd.read_parquet(BytesIO(file_bytes))
             output_file = "output.csv"
             df.to_csv(output_file, index=False, encoding='utf-8')
             converted_format = "CSV"
         else:
+            return None, "잘못된 변환 유형이 선택되었습니다."
+        # 상위 10개 행의 미리보기 생성
         preview = df.head(10).to_string(index=False)
         info_message = (
+            f"입력 파일: {file_name}\n"
+            f"변환된 파일 형식: {converted_format}\n"
         )
+        if conversion_type == "CSV to Parquet" and encoding:
+            info_message += f"사용된 인코딩: {encoding}\n"
+        info_message += f"\n미리보기 (상위 10개 행):\n{preview}"
         return output_file, info_message
     except Exception as e:
+        return None, f"변환 중 오류 발생: {str(e)}"
+# 모던하고 세련된 스타일을 위한 사용자 정의 CSS
 custom_css = """
 body {
     background-color: #f4f4f4;
 }
 """
+with gr.Blocks(css=custom_css, title="CSV <-> Parquet 변환기") as demo:
+    gr.Markdown("# CSV <-> Parquet 변환기")
+    gr.Markdown("CSV 또는 Parquet 파일을 업로드하고 변환 유형을 선택하세요. 앱은 파일을 반대 형식으로 변환하고 상위 10개 행의 미리보기를 표시합니다.")
     with gr.Row():
         with gr.Column(scale=1):
+            input_file = gr.File(label="CSV 또는 Parquet 파일 업로드")
         with gr.Column(scale=1):
             conversion_type = gr.Radio(
                 choices=["CSV to Parquet", "Parquet to CSV"],
+                label="변환 유형",
+                value="CSV to Parquet"  # 기본값 설정
             )
+    convert_button = gr.Button("변환", elem_classes=["gradio-button"])
     with gr.Row():
+        output_file = gr.File(label="변환된 파일")
+        preview = gr.Textbox(label="미리보기 (상위 10개 행)", lines=15)
     convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
     gr.Markdown("""
+    ### 참고:
+    - 이 변환기는 일반적인 CSV 인코딩(UTF-8, Latin-1, ISO-8859-1, CP1252)을 시도합니다
+    - Parquet 파일은 CSV보다 데이터 타입을 더 잘 보존합니다
+    - 미리보기는 데이터의 처음 10행만 표시합니다
     """)
 if __name__ == "__main__":