openfree committed on
Commit
1ec543a
·
verified ·
1 Parent(s): 5ffc072

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -42
app.py CHANGED
@@ -3,17 +3,17 @@ import pandas as pd
3
  from io import BytesIO
4
 
5
  def convert_file(input_file, conversion_type):
6
- # 파일이 업로드되었는지 확인
7
  if input_file is None:
8
- return None, "파일을 업로드해 주세요."
9
 
10
- # 파일 내용 읽기
11
  try:
12
- # 파일 객체에서 읽기 시도
13
  file_bytes = input_file.read()
14
  file_name = input_file.name
15
  except AttributeError:
16
- # AttributeError가 발생하면 input_file을 파일 경로로 처리
17
  file_name = input_file
18
  with open(file_name, "rb") as f:
19
  file_bytes = f.read()
@@ -24,59 +24,66 @@ def convert_file(input_file, conversion_type):
24
  converted_format = None
25
 
26
  try:
27
- # 변환: CSV에서 Parquet으로
28
  if conversion_type == "CSV to Parquet":
29
  if file_extension != "csv":
30
- return None, "CSV에서 Parquet으로 변환하려면 CSV 파일을 업로드해 주세요."
31
 
32
- # 다양한 인코딩을 시도 (chardet 없이)
33
  encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
34
- encoding = None
35
 
36
- for enc in encodings_to_try:
37
- try:
38
- df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
39
- encoding = enc
40
- break
41
- except UnicodeDecodeError:
42
- continue
43
- except Exception as e:
44
- return None, f"CSV 읽기 오류: {str(e)}"
 
 
 
 
 
 
 
45
 
46
  if df is None:
47
- return None, "일반적인 인코딩으로 CSV를 읽지 못했습니다. 파일이 특이한 인코딩을 사용할 수 있습니다."
48
 
49
  output_file = "output.parquet"
50
  df.to_parquet(output_file, index=False)
51
  converted_format = "Parquet"
52
 
53
- # 변환: Parquet에서 CSV로
54
  elif conversion_type == "Parquet to CSV":
55
  if file_extension != "parquet":
56
- return None, "Parquet에서 CSV로 변환하려면 Parquet 파일을 업로드해 주세요."
57
 
58
  df = pd.read_parquet(BytesIO(file_bytes))
59
  output_file = "output.csv"
60
  df.to_csv(output_file, index=False, encoding='utf-8')
61
  converted_format = "CSV"
62
  else:
63
- return None, "잘못된 변환 유형이 선택되었습니다."
64
 
65
- # μƒμœ„ 10개 ν–‰μ˜ 미리보기 생성
66
  preview = df.head(10).to_string(index=False)
67
  info_message = (
68
- f"입력 파일: {file_name}\n"
69
- f"변환된 파일 형식: {converted_format}\n"
70
  )
71
- if conversion_type == "CSV to Parquet" and encoding:
72
- info_message += f"사용된 인코딩: {encoding}\n"
73
 
74
- info_message += f"\n미리보기 (상위 10개 행):\n{preview}"
75
 
76
  return output_file, info_message
77
 
78
  except Exception as e:
79
- return None, f"변환 중 오류 발생: {str(e)}"
80
 
81
  # λͺ¨λ˜ν•˜κ³  μ„Έλ ¨λœ μŠ€νƒ€οΏ½οΏ½οΏ½μ„ μœ„ν•œ μ‚¬μš©μž μ •μ˜ CSS
82
  custom_css = """
@@ -112,33 +119,34 @@ h1, h2 {
112
  }
113
  """
114
 
115
- with gr.Blocks(css=custom_css, title="CSV <-> Parquet 변환기") as demo:
116
- gr.Markdown("# CSV <-> Parquet 변환기")
117
- gr.Markdown("CSV 또는 Parquet 파일을 업로드하고 변환 유형을 선택하세요. 앱은 파일을 반대 형식으로 변환하고 상위 10개 행의 미리보기를 표시합니다.")
118
 
119
  with gr.Row():
120
  with gr.Column(scale=1):
121
- input_file = gr.File(label="CSV 또는 Parquet 파일 업로드")
122
  with gr.Column(scale=1):
123
  conversion_type = gr.Radio(
124
  choices=["CSV to Parquet", "Parquet to CSV"],
125
- label="변환 유형",
126
- value="CSV to Parquet" # 기본값 설정
127
  )
128
 
129
- convert_button = gr.Button("변환", elem_classes=["gradio-button"])
130
 
131
  with gr.Row():
132
- output_file = gr.File(label="변환된 파일")
133
- preview = gr.Textbox(label="미리보기 (상위 10개 행)", lines=15)
134
 
135
  convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
136
 
137
  gr.Markdown("""
138
- ### 참고:
139
- - 이 변환기는 일반적인 CSV 인코딩(UTF-8, Latin-1, ISO-8859-1, CP1252)을 시도합니다
140
- - Parquet 파일은 CSV보다 데이터 타입을 더 잘 보존합니다
141
- - 미리보기는 데이터의 처음 10행만 표시합니다
 
142
  """)
143
 
144
  if __name__ == "__main__":
 
3
  from io import BytesIO
4
 
5
  def convert_file(input_file, conversion_type):
6
+ # Check if a file was uploaded
7
  if input_file is None:
8
+ return None, "Please upload a file."
9
 
10
+ # Read the file content
11
  try:
12
+ # Try reading from file-like object
13
  file_bytes = input_file.read()
14
  file_name = input_file.name
15
  except AttributeError:
16
+ # If there's an AttributeError, treat input_file as a file path
17
  file_name = input_file
18
  with open(file_name, "rb") as f:
19
  file_bytes = f.read()
 
24
  converted_format = None
25
 
26
  try:
27
+ # Conversion: CSV to Parquet
28
  if conversion_type == "CSV to Parquet":
29
  if file_extension != "csv":
30
+ return None, "For CSV to Parquet conversion, please upload a CSV file."
31
 
32
+ # Set UTF-8 as default encoding and try others if needed
33
  encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
34
+ encoding = 'utf-8' # Set UTF-8 as default encoding
35
 
36
+ # Try UTF-8 first, then other encodings if it fails
37
+ try:
38
+ df = pd.read_csv(BytesIO(file_bytes), encoding=encoding)
39
+ except UnicodeDecodeError:
40
+ # Try other encodings if UTF-8 fails
41
+ for enc in encodings_to_try[1:]: # Try remaining encodings except utf-8
42
+ try:
43
+ df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
44
+ encoding = enc
45
+ break
46
+ except UnicodeDecodeError:
47
+ continue
48
+ except Exception as e:
49
+ return None, f"Error reading CSV: {str(e)}"
50
+ except Exception as e:
51
+ return None, f"Error reading CSV: {str(e)}"
52
 
53
  if df is None:
54
+ return None, "Failed to read CSV with any of the common encodings. Your file might use a rare encoding."
55
 
56
  output_file = "output.parquet"
57
  df.to_parquet(output_file, index=False)
58
  converted_format = "Parquet"
59
 
60
+ # Conversion: Parquet to CSV
61
  elif conversion_type == "Parquet to CSV":
62
  if file_extension != "parquet":
63
+ return None, "For Parquet to CSV conversion, please upload a Parquet file."
64
 
65
  df = pd.read_parquet(BytesIO(file_bytes))
66
  output_file = "output.csv"
67
  df.to_csv(output_file, index=False, encoding='utf-8')
68
  converted_format = "CSV"
69
  else:
70
+ return None, "Invalid conversion type selected."
71
 
72
+ # Generate a preview of the top 10 rows
73
  preview = df.head(10).to_string(index=False)
74
  info_message = (
75
+ f"Input file: {file_name}\n"
76
+ f"Converted file format: {converted_format}\n"
77
  )
78
+ if conversion_type == "CSV to Parquet":
79
+ info_message += f"Used encoding: {encoding}\n"
80
 
81
+ info_message += f"\nPreview (Top 10 Rows):\n{preview}"
82
 
83
  return output_file, info_message
84
 
85
  except Exception as e:
86
+ return None, f"Error during conversion: {str(e)}"
87
 
88
  # λͺ¨λ˜ν•˜κ³  μ„Έλ ¨λœ μŠ€νƒ€οΏ½οΏ½οΏ½μ„ μœ„ν•œ μ‚¬μš©μž μ •μ˜ CSS
89
  custom_css = """
 
119
  }
120
  """
121
 
122
+ with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
123
+ gr.Markdown("# CSV <-> Parquet Converter")
124
+ gr.Markdown("Upload a CSV or Parquet file and select the conversion type. The app converts the file to the opposite format and displays a preview of the top 10 rows.")
125
 
126
  with gr.Row():
127
  with gr.Column(scale=1):
128
+ input_file = gr.File(label="Upload CSV or Parquet File")
129
  with gr.Column(scale=1):
130
  conversion_type = gr.Radio(
131
  choices=["CSV to Parquet", "Parquet to CSV"],
132
+ label="Conversion Type",
133
+ value="CSV to Parquet" # Set default value
134
  )
135
 
136
+ convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
137
 
138
  with gr.Row():
139
+ output_file = gr.File(label="Converted File")
140
+ preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
141
 
142
  convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
143
 
144
  gr.Markdown("""
145
+ ### Notes:
146
+ - This converter uses UTF-8 as the default encoding
147
+ - If UTF-8 fails, it tries Latin-1, ISO-8859-1, and CP1252 encodings
148
+ - Parquet files preserve data types better than CSV
149
+ - The preview shows only the first 10 rows of data
150
  """)
151
 
152
  if __name__ == "__main__":