openfree committed on
Commit
5ffc072
·
verified ·
1 Parent(s): fa41b98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -54
app.py CHANGED
@@ -1,26 +1,19 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import chardet
4
  from io import BytesIO
5
 
6
- def detect_encoding(file_bytes):
7
- """Detect the encoding of the file."""
8
- # Use chardet to detect encoding
9
- result = chardet.detect(file_bytes)
10
- return result['encoding']
11
-
12
  def convert_file(input_file, conversion_type):
13
- # Check if a file was uploaded
14
  if input_file is None:
15
- return None, "Please upload a file."
16
 
17
- # Read the file content
18
  try:
19
- # Try reading from file-like object
20
  file_bytes = input_file.read()
21
  file_name = input_file.name
22
  except AttributeError:
23
- # If there's an AttributeError, treat input_file as a file path
24
  file_name = input_file
25
  with open(file_name, "rb") as f:
26
  file_bytes = f.read()
@@ -31,62 +24,61 @@ def convert_file(input_file, conversion_type):
31
  converted_format = None
32
 
33
  try:
34
- # Conversion: CSV to Parquet
35
  if conversion_type == "CSV to Parquet":
36
  if file_extension != "csv":
37
- return None, "For CSV to Parquet conversion, please upload a CSV file."
 
 
 
 
38
 
39
- # Detect the encoding of the CSV file
40
- encoding = detect_encoding(file_bytes)
 
 
 
 
 
 
 
41
 
42
- # Try to read with detected encoding
43
- try:
44
- df = pd.read_csv(BytesIO(file_bytes), encoding=encoding)
45
- except Exception as e:
46
- # If that fails, try with other common encodings
47
- for enc in ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']:
48
- try:
49
- df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
50
- encoding = enc
51
- break
52
- except:
53
- continue
54
- if df is None:
55
- return None, f"Failed to read CSV with any encoding. Error: {str(e)}"
56
 
57
  output_file = "output.parquet"
58
  df.to_parquet(output_file, index=False)
59
  converted_format = "Parquet"
60
 
61
- # Conversion: Parquet to CSV
62
  elif conversion_type == "Parquet to CSV":
63
  if file_extension != "parquet":
64
- return None, "For Parquet to CSV conversion, please upload a Parquet file."
65
 
66
  df = pd.read_parquet(BytesIO(file_bytes))
67
  output_file = "output.csv"
68
  df.to_csv(output_file, index=False, encoding='utf-8')
69
  converted_format = "CSV"
70
  else:
71
- return None, "Invalid conversion type selected."
72
 
73
- # Generate a preview of the top 10 rows
74
  preview = df.head(10).to_string(index=False)
75
  info_message = (
76
- f"Input file: {file_name}\n"
77
- f"Converted file format: {converted_format}\n"
78
  )
79
- if conversion_type == "CSV to Parquet":
80
- info_message += f"Detected encoding: {encoding}\n"
81
 
82
- info_message += f"\nPreview (Top 10 Rows):\n{preview}"
83
 
84
  return output_file, info_message
85
 
86
  except Exception as e:
87
- return None, f"Error during conversion: {str(e)}"
88
 
89
- # Custom CSS for a modern and sleek look
90
  custom_css = """
91
  body {
92
  background-color: #f4f4f4;
@@ -120,33 +112,33 @@ h1, h2 {
120
  }
121
  """
122
 
123
- with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
124
- gr.Markdown("# CSV <-> Parquet Converter")
125
- gr.Markdown("Upload a CSV or Parquet file and select the conversion type. The app converts the file to the opposite format and displays a preview of the top 10 rows.")
126
 
127
  with gr.Row():
128
  with gr.Column(scale=1):
129
- input_file = gr.File(label="Upload CSV or Parquet File")
130
  with gr.Column(scale=1):
131
  conversion_type = gr.Radio(
132
  choices=["CSV to Parquet", "Parquet to CSV"],
133
- label="Conversion Type",
134
- value="CSV to Parquet" # Set default value
135
  )
136
 
137
- convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
138
 
139
  with gr.Row():
140
- output_file = gr.File(label="Converted File")
141
- preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
142
 
143
  convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
144
 
145
  gr.Markdown("""
146
- ### Notes:
147
- - This converter can handle various CSV encodings
148
- - Parquet files are always encoded in UTF-8
149
- - The preview shows only the first 10 rows of data
150
  """)
151
 
152
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import pandas as pd
 
3
  from io import BytesIO
4
 
 
 
 
 
 
 
5
  def convert_file(input_file, conversion_type):
6
+ # 파일이 업로드되었는지 확인
7
  if input_file is None:
8
+ return None, "파일을 업로드해 주세요."
9
 
10
+ # 파일 내용 읽기
11
  try:
12
+ # 파일 객체에서 읽기 시도
13
  file_bytes = input_file.read()
14
  file_name = input_file.name
15
  except AttributeError:
16
+ # AttributeError가 발생하면 input_file을 파일 경로로 처리
17
  file_name = input_file
18
  with open(file_name, "rb") as f:
19
  file_bytes = f.read()
 
24
  converted_format = None
25
 
26
  try:
27
+ # 변환: CSV에서 Parquet으로
28
  if conversion_type == "CSV to Parquet":
29
  if file_extension != "csv":
30
+ return None, "CSV에서 Parquet으로 변환하려면 CSV 파일을 업로드해 주세요."
31
+
32
+ # 다양한 인코딩을 시도 (chardet 없이)
33
+ encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
34
+ encoding = None
35
 
36
+ for enc in encodings_to_try:
37
+ try:
38
+ df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
39
+ encoding = enc
40
+ break
41
+ except UnicodeDecodeError:
42
+ continue
43
+ except Exception as e:
44
+ return None, f"CSV 읽기 오류: {str(e)}"
45
 
46
+ if df is None:
47
+ return None, "일반적인 인코딩으로 CSV를 읽지 못했습니다. 파일이 특이한 인코딩을 사용할 수 있습니다."
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  output_file = "output.parquet"
50
  df.to_parquet(output_file, index=False)
51
  converted_format = "Parquet"
52
 
53
+ # 변환: Parquet에서 CSV로
54
  elif conversion_type == "Parquet to CSV":
55
  if file_extension != "parquet":
56
+ return None, "Parquet에서 CSV로 변환하려면 Parquet 파일을 업로드해 주세요."
57
 
58
  df = pd.read_parquet(BytesIO(file_bytes))
59
  output_file = "output.csv"
60
  df.to_csv(output_file, index=False, encoding='utf-8')
61
  converted_format = "CSV"
62
  else:
63
+ return None, "잘못된 변환 유형이 선택되었습니다."
64
 
65
+ # μƒμœ„ 10개 ν–‰μ˜ 미리보기 생성
66
  preview = df.head(10).to_string(index=False)
67
  info_message = (
68
+ f"입력 파일: {file_name}\n"
69
+ f"변환된 파일 형식: {converted_format}\n"
70
  )
71
+ if conversion_type == "CSV to Parquet" and encoding:
72
+ info_message += f"사용된 인코딩: {encoding}\n"
73
 
74
+ info_message += f"\n미리보기 (상위 10개 행):\n{preview}"
75
 
76
  return output_file, info_message
77
 
78
  except Exception as e:
79
+ return None, f"변환 중 오류 발생: {str(e)}"
80
 
81
+ # λͺ¨λ˜ν•˜κ³  μ„Έλ ¨λœ μŠ€νƒ€μΌμ„ μœ„οΏ½οΏ½οΏ½ μ‚¬μš©μž μ •μ˜ CSS
82
  custom_css = """
83
  body {
84
  background-color: #f4f4f4;
 
112
  }
113
  """
114
 
115
+ with gr.Blocks(css=custom_css, title="CSV <-> Parquet 변환기") as demo:
116
+ gr.Markdown("# CSV <-> Parquet 변환기")
117
+ gr.Markdown("CSV 또는 Parquet 파일을 업로드하고 변환 유형을 선택하세요. 앱은 파일을 반대 형식으로 변환하고 상위 10개 행의 미리보기를 표시합니다.")
118
 
119
  with gr.Row():
120
  with gr.Column(scale=1):
121
+ input_file = gr.File(label="CSV 또는 Parquet 파일 업로드")
122
  with gr.Column(scale=1):
123
  conversion_type = gr.Radio(
124
  choices=["CSV to Parquet", "Parquet to CSV"],
125
+ label="변환 유형",
126
+ value="CSV to Parquet" # 기본값 설정
127
  )
128
 
129
+ convert_button = gr.Button("변환", elem_classes=["gradio-button"])
130
 
131
  with gr.Row():
132
+ output_file = gr.File(label="변환된 파일")
133
+ preview = gr.Textbox(label="미리보기 (상위 10개 행)", lines=15)
134
 
135
  convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
136
 
137
  gr.Markdown("""
138
+ ### 참고:
139
+ - 이 변환기는 일반적인 CSV 인코딩(UTF-8, Latin-1, ISO-8859-1, CP1252)을 시도합니다
140
+ - Parquet 파일은 CSV보다 데이터 타입을 더 잘 보존합니다
141
+ - 미리보기는 데이터의 처음 10행만 표시합니다
142
  """)
143
 
144
  if __name__ == "__main__":