Parquet-CSV-Convertor

Running

App Files Community

openfree committed 5 days ago

Commit

1ec543a

verified ·

1 Parent(s): 5ffc072

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -42

app.py CHANGED Viewed

@@ -3,17 +3,17 @@ import pandas as pd
 from io import BytesIO
 def convert_file(input_file, conversion_type):
-    # 파일이 업로드되었는지 확인
     if input_file is None:
-        return None, "파일을 업로드해 주세요."
-    # 파일 내용 읽기
     try:
-        # 파일 객체에서 읽기 시도
         file_bytes = input_file.read()
         file_name = input_file.name
     except AttributeError:
-        # AttributeError가 발생하면 input_file을 파일 경로로 처리
         file_name = input_file
         with open(file_name, "rb") as f:
             file_bytes = f.read()
@@ -24,59 +24,66 @@ def convert_file(input_file, conversion_type):
     converted_format = None
     try:
-        # 변환: CSV에서 Parquet으로
         if conversion_type == "CSV to Parquet":
             if file_extension != "csv":
-                return None, "CSV에서 Parquet으로 변환하려면 CSV 파일을 업로드해 주세요."
-            # 다양한 인코딩을 시도 (chardet 없이)
             encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
-            encoding = None
-            for enc in encodings_to_try:
-                try:
-                    df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
-                    encoding = enc
-                    break
-                except UnicodeDecodeError:
-                    continue
-                except Exception as e:
-                    return None, f"CSV 읽기 오류: {str(e)}"
             if df is None:
-                return None, "일반적인 인코딩으로 CSV를 읽지 못했습니다. 파일이 특이한 인코딩을 사용할 수 있습니다."
             output_file = "output.parquet"
             df.to_parquet(output_file, index=False)
             converted_format = "Parquet"
-        # 변환: Parquet에서 CSV로
         elif conversion_type == "Parquet to CSV":
             if file_extension != "parquet":
-                return None, "Parquet에서 CSV로 변환하려면 Parquet 파일을 업로드해 주세요."
             df = pd.read_parquet(BytesIO(file_bytes))
             output_file = "output.csv"
             df.to_csv(output_file, index=False, encoding='utf-8')
             converted_format = "CSV"
         else:
-            return None, "잘못된 변환 유형이 선택되었습니다."
-        # 상위 10개 행의 미리보기 생성
         preview = df.head(10).to_string(index=False)
         info_message = (
-            f"입력 파일: {file_name}\n"
-            f"변환된 파일 형식: {converted_format}\n"
         )
-        if conversion_type == "CSV to Parquet" and encoding:
-            info_message += f"사용된 인코딩: {encoding}\n"
-        info_message += f"\n미리보기 (상위 10개 행):\n{preview}"
         return output_file, info_message
     except Exception as e:
-        return None, f"변환 중 오류 발생: {str(e)}"
 # 모던하고 세련된 스타일을 위한 사용자 정의 CSS
 custom_css = """
@@ -112,33 +119,34 @@ h1, h2 {
 }
 """
-with gr.Blocks(css=custom_css, title="CSV <-> Parquet 변환기") as demo:
-    gr.Markdown("# CSV <-> Parquet 변환기")
-    gr.Markdown("CSV 또는 Parquet 파일을 업로드하고 변환 유형을 선택하세요. 앱은 파일을 반대 형식으로 변환하고 상위 10개 행의 미리보기를 표시합니다.")
     with gr.Row():
         with gr.Column(scale=1):
-            input_file = gr.File(label="CSV 또는 Parquet 파일 업로드")
         with gr.Column(scale=1):
             conversion_type = gr.Radio(
                 choices=["CSV to Parquet", "Parquet to CSV"],
-                label="변환 유형",
-                value="CSV to Parquet"  # 기본값 설정
             )
-    convert_button = gr.Button("변환", elem_classes=["gradio-button"])
     with gr.Row():
-        output_file = gr.File(label="변환된 파일")
-        preview = gr.Textbox(label="미리보기 (상위 10개 행)", lines=15)
     convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
     gr.Markdown("""
-    ### 참고:
-    - 이 변환기는 일반적인 CSV 인코딩(UTF-8, Latin-1, ISO-8859-1, CP1252)을 시도합니다
-    - Parquet 파일은 CSV보다 데이터 타입을 더 잘 보존합니다
-    - 미리보기는 데이터의 처음 10행만 표시합니다
     """)
 if __name__ == "__main__":

 from io import BytesIO
 def convert_file(input_file, conversion_type):
+    # Check if a file was uploaded
     if input_file is None:
+        return None, "Please upload a file."
+    # Read the file content
     try:
+        # Try reading from file-like object
         file_bytes = input_file.read()
         file_name = input_file.name
     except AttributeError:
+        # If there's an AttributeError, treat input_file as a file path
         file_name = input_file
         with open(file_name, "rb") as f:
             file_bytes = f.read()
     converted_format = None
     try:
+        # Conversion: CSV to Parquet
         if conversion_type == "CSV to Parquet":
             if file_extension != "csv":
+                return None, "For CSV to Parquet conversion, please upload a CSV file."
+            # Set UTF-8 as default encoding and try others if needed
             encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
+            encoding = 'utf-8'  # Set UTF-8 as default encoding
+            # Try UTF-8 first, then other encodings if it fails
+            try:
+                df = pd.read_csv(BytesIO(file_bytes), encoding=encoding)
+            except UnicodeDecodeError:
+                # Try other encodings if UTF-8 fails
+                for enc in encodings_to_try[1:]:  # Try remaining encodings except utf-8
+                    try:
+                        df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
+                        encoding = enc
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                    except Exception as e:
+                        return None, f"Error reading CSV: {str(e)}"
+            except Exception as e:
+                return None, f"Error reading CSV: {str(e)}"
             if df is None:
+                return None, "Failed to read CSV with any of the common encodings. Your file might use a rare encoding."
             output_file = "output.parquet"
             df.to_parquet(output_file, index=False)
             converted_format = "Parquet"
+        # Conversion: Parquet to CSV
         elif conversion_type == "Parquet to CSV":
             if file_extension != "parquet":
+                return None, "For Parquet to CSV conversion, please upload a Parquet file."
             df = pd.read_parquet(BytesIO(file_bytes))
             output_file = "output.csv"
             df.to_csv(output_file, index=False, encoding='utf-8')
             converted_format = "CSV"
         else:
+            return None, "Invalid conversion type selected."
+        # Generate a preview of the top 10 rows
         preview = df.head(10).to_string(index=False)
         info_message = (
+            f"Input file: {file_name}\n"
+            f"Converted file format: {converted_format}\n"
         )
+        if conversion_type == "CSV to Parquet":
+            info_message += f"Used encoding: {encoding}\n"
+        info_message += f"\nPreview (Top 10 Rows):\n{preview}"
         return output_file, info_message
     except Exception as e:
+        return None, f"Error during conversion: {str(e)}"
 # 모던하고 세련된 스타일을 위한 사용자 정의 CSS
 custom_css = """
 }
 """
+with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
+    gr.Markdown("# CSV <-> Parquet Converter")
+    gr.Markdown("Upload a CSV or Parquet file and select the conversion type. The app converts the file to the opposite format and displays a preview of the top 10 rows.")
     with gr.Row():
         with gr.Column(scale=1):
+            input_file = gr.File(label="Upload CSV or Parquet File")
         with gr.Column(scale=1):
             conversion_type = gr.Radio(
                 choices=["CSV to Parquet", "Parquet to CSV"],
+                label="Conversion Type",
+                value="CSV to Parquet"  # Set default value
             )
+    convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
     with gr.Row():
+        output_file = gr.File(label="Converted File")
+        preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
     convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
     gr.Markdown("""
+    ### Notes:
+    - This converter uses UTF-8 as the default encoding
+    - If UTF-8 fails, it tries Latin-1, ISO-8859-1, and CP1252 encodings
+    - Parquet files preserve data types better than CSV
+    - The preview shows only the first 10 rows of data
     """)
 if __name__ == "__main__":