카테고리 없음
csv reader
유키공
2025. 4. 30. 13:08
import json
import os
import sys
import traceback

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from PyQt5.QtCore import (
    Qt, QAbstractTableModel, QSortFilterProxyModel,
    QThread, pyqtSignal, QObject
)
from PyQt5.QtGui import QColor
from PyQt5.QtWidgets import (
    QApplication, QMainWindow, QTableView, QFileDialog,
    QVBoxLayout, QWidget, QPushButton, QLabel,
    QStatusBar, QMessageBox, QLineEdit, QHBoxLayout,
    QComboBox, QHeaderView, QProgressDialog
)
class LoadWorker(QThread):
    """Worker thread that loads a Parquet or CSV file into a DataFrame.

    Every value is converted to a string for display: dict/list values are
    serialized as JSON, other non-string values go through ``str``, and
    missing values become empty strings.

    Signals:
        progress (int): approximate completion percentage (0-100).
        finished (pd.DataFrame): emitted with the fully loaded frame.
        error (str): emitted with a message and traceback on failure.
    """

    progress = pyqtSignal(int)
    # NOTE: this attribute has the same name as QThread's built-in
    # ``finished`` signal, so ``worker.finished`` refers to this
    # DataFrame-carrying signal. The name is kept for existing callers.
    finished = pyqtSignal(pd.DataFrame)
    error = pyqtSignal(str)

    # Number of rows read per chunk when streaming a CSV file.
    CSV_CHUNK_SIZE = 100000

    def __init__(self, file_path):
        """Remember the path of the file to load; loading starts in run()."""
        super().__init__()
        self.file_path = file_path

    def safe_json_dumps(self, obj):
        """Return *obj* as display text, using JSON for dict/list values.

        Returns the placeholder ``"[Conversion Error]"`` when the value
        cannot be serialized (e.g. it contains a non-JSON-serializable
        object), so one bad cell never aborts the whole load.
        """
        try:
            if isinstance(obj, (dict, list)):
                return json.dumps(obj, ensure_ascii=False)
            return str(obj)
        except Exception:
            # Deliberate best effort, but no longer a bare ``except`` that
            # would also swallow KeyboardInterrupt/SystemExit.
            return "[Conversion Error]"

    def convert_complex_types(self, df):
        """Convert the columns of *df* to display strings, in place.

        Columns whose first non-null value is a dict or list are serialized
        with safe_json_dumps(); other non-string columns are converted with
        ``astype(str)``. Missing values are replaced by empty strings rather
        than the literal text "nan"/"None".

        Returns:
            The same DataFrame object, for call chaining.
        """
        for col in df.columns:
            try:
                series = df[col]
                null_mask = series.isna()
                # Sample the first non-null value: row 0 may be missing even
                # when the rest of the column holds dicts/lists.
                non_null = series[~null_mask]
                sample = non_null.iloc[0] if len(non_null) > 0 else None
                if isinstance(sample, (dict, list)):
                    converted = series.apply(self.safe_json_dumps)
                elif not pd.api.types.is_string_dtype(series):
                    converted = series.astype(str)
                else:
                    # Already strings; run() blanks any remaining nulls.
                    continue
                # str() turns NaN/None into "nan"/"None"; blank them so that
                # missing values are displayed as empty cells.
                df[col] = converted.where(~null_mask, "")
            except Exception as e:
                print(f"컬럼 {col} 처리 오류: {e}")
                df[col] = df[col].astype(str)
        return df

    def _load_parquet(self):
        """Read the Parquet file one row group at a time, reporting progress."""
        parquet_file = pq.ParquetFile(self.file_path)
        num_row_groups = parquet_file.num_row_groups
        if num_row_groups == 0:
            # pd.concat([]) raises ValueError; read the (empty) table instead
            # so the column headers are still available.
            empty_df = parquet_file.read().to_pandas()
            self.progress.emit(90)
            return self.convert_complex_types(empty_df)

        chunks = []
        for i in range(num_row_groups):
            self.progress.emit(10 + int((i + 1) / num_row_groups * 70))
            df = parquet_file.read_row_group(i).to_pandas()
            chunks.append(self.convert_complex_types(df))
        self.progress.emit(90)
        return pd.concat(chunks, ignore_index=True)

    def _load_csv(self):
        """Read the CSV file in chunks, reporting progress."""
        # The line count is only a progress estimate. Binary mode avoids
        # decoding the file twice, and ``with`` closes the handle.
        with open(self.file_path, 'rb') as line_source:
            total_rows = sum(1 for _ in line_source) - 1
        # Header-only or empty files would otherwise divide by zero below.
        total_rows = max(total_rows, 1)

        chunks = []
        processed_rows = 0
        for chunk in pd.read_csv(self.file_path, chunksize=self.CSV_CHUNK_SIZE):
            fraction = min(processed_rows / total_rows, 1.0)
            self.progress.emit(10 + int(fraction * 70))
            chunk = self.convert_complex_types(chunk)
            chunks.append(chunk)
            processed_rows += len(chunk)
        self.progress.emit(90)
        return pd.concat(chunks, ignore_index=True)

    def run(self):
        """Thread entry point: load the file and emit finished or error."""
        try:
            self.progress.emit(5)
            # Compare the extension case-insensitively (e.g. "DATA.PARQUET").
            if self.file_path.lower().endswith('.parquet'):
                result_df = self._load_parquet()
            else:
                result_df = self._load_csv()
            self.progress.emit(95)
            # Blank the nulls left in columns that were already strings.
            result_df = result_df.fillna("")
            self.progress.emit(100)
            self.finished.emit(result_df)
        except Exception as e:
            # Top-level boundary of the thread: report any failure to the GUI
            # instead of letting the exception escape run().
            error_msg = f"로딩 실패: {str(e)}\n\n{traceback.format_exc()}"
            self.error.emit(error_msg)
class DataFrameModel(QAbstractTableModel):
    """Read-only Qt table model backed by a pandas DataFrame.

    Cells are rendered as text; cells holding a dict or list get a light
    blue background, all other cells a white one.
    """

    def __init__(self, data):
        """Store the DataFrame *data* that backs the model."""
        super().__init__()
        self._data = data

    def rowCount(self, parent=None):
        """Return the number of DataFrame rows (0 for a valid parent)."""
        # A table model has no child items, so a valid parent has no rows.
        if parent is not None and parent.isValid():
            return 0
        return len(self._data)

    def columnCount(self, parent=None):
        """Return the number of DataFrame columns (0 for a valid parent)."""
        if parent is not None and parent.isValid():
            return 0
        return len(self._data.columns)

    def data(self, index, role=Qt.DisplayRole):
        """Return the display text or background colour for *index*.

        Returns None for invalid indexes and for any other role.
        """
        if not index.isValid():
            return None
        # Check the role first: views query many roles per cell, and the
        # DataFrame lookup is only needed for the two roles handled here.
        if role != Qt.DisplayRole and role != Qt.BackgroundRole:
            return None

        value = self._data.iloc[index.row(), index.column()]
        if role == Qt.DisplayRole:
            # pd.isna() on a list returns an array, whose truth value raises
            # ValueError; only test scalars for missingness.
            is_missing = pd.api.types.is_scalar(value) and pd.isna(value)
            return "" if is_missing else str(value)

        # role == Qt.BackgroundRole
        if isinstance(value, (dict, list)):
            return QColor(240, 248, 255)  # background for dict/list cells
        return QColor(255, 255, 255)

    def headerData(self, section, orientation, role=Qt.DisplayRole):
        """Return the column name (horizontal) or index label (vertical)."""
        if role == Qt.DisplayRole:
            if orientation == Qt.Horizontal:
                return str(self._data.columns[section])
            return str(self._data.index[section])
        return None
class DataViewer(QMainWindow):
    """Main window: opens a Parquet/CSV file and shows it in a table.

    Loading runs in a LoadWorker thread behind a modal progress dialog. The
    table is shown through a QSortFilterProxyModel, so it can be sorted and
    filtered with the search box and column selector.
    """

    def __init__(self):
        """Create the window and build its widgets."""
        super().__init__()
        # Set by load_data(); None until the first file is opened.
        self.load_worker = None
        self.setWindowTitle("Data Viewer with Dict Handling")
        self.setGeometry(100, 100, 1200, 800)
        self.setup_ui()

    def setup_ui(self):
        """Build the control panel, table view, status bar and progress dialog."""
        self.central_widget = QWidget()
        self.setCentralWidget(self.central_widget)
        layout = QVBoxLayout(self.central_widget)

        # Control panel: open button, search box, column selector.
        control_panel = QWidget()
        control_layout = QHBoxLayout(control_panel)
        self.btn_open = QPushButton("파일 열기")
        self.btn_open.clicked.connect(self.open_file)
        control_layout.addWidget(self.btn_open)
        self.search_input = QLineEdit()
        self.search_input.setPlaceholderText("검색어 입력")
        # Filter when Enter is pressed rather than on every keystroke, since
        # re-filtering a large table is expensive.
        self.search_input.returnPressed.connect(self.apply_filter)
        control_layout.addWidget(self.search_input)
        self.column_combo = QComboBox()
        self.column_combo.addItem("모든 컬럼")
        self.column_combo.currentIndexChanged[int].connect(
            self.change_filter_column)
        control_layout.addWidget(self.column_combo)
        layout.addWidget(control_panel)

        # Table view on top of a sort/filter proxy.
        self.table_view = QTableView()
        self.table_view.setSortingEnabled(True)
        self.proxy_model = QSortFilterProxyModel()
        self.proxy_model.setFilterCaseSensitivity(Qt.CaseInsensitive)
        # The proxy filters on column 0 by default; -1 searches every
        # column, matching the initial "all columns" combo entry.
        self.proxy_model.setFilterKeyColumn(-1)
        self.table_view.setModel(self.proxy_model)
        layout.addWidget(self.table_view)

        # Status bar.
        self.status_bar = QStatusBar()
        self.setStatusBar(self.status_bar)

        # Loading dialog.
        self.progress_dialog = QProgressDialog("파일을 로드 중입니다...", "취소", 0, 100, self)
        self.progress_dialog.setWindowModality(Qt.WindowModal)
        self.progress_dialog.canceled.connect(self.cancel_loading)
        # A QProgressDialog shows itself once its minimum duration elapses
        # after construction; reset() keeps it hidden until load_data()
        # shows it (reset() does not emit canceled).
        self.progress_dialog.reset()

    def apply_filter(self):
        """Filter the table rows by the text currently in the search box."""
        self.proxy_model.setFilterFixedString(self.search_input.text())

    def change_filter_column(self, combo_index):
        """Restrict the search to the column selected in the combo box.

        Combo entry 0 is "all columns" and entry i > 0 is DataFrame column
        i - 1. An index of -1 (emitted while the combo is being cleared) is
        also treated as "all columns".
        """
        key_column = combo_index - 1 if combo_index > 0 else -1
        self.proxy_model.setFilterKeyColumn(key_column)

    def open_file(self):
        """Ask the user for a file and start loading it."""
        file_path, _ = QFileDialog.getOpenFileName(
            self, "파일 열기", "",
            "데이터 파일 (*.parquet *.csv);;모든 파일 (*)")
        if file_path:
            self.load_data(file_path)

    def load_data(self, file_path):
        """Load *file_path* in a worker thread while showing progress."""
        self.progress_dialog.reset()
        self.progress_dialog.show()
        self.btn_open.setEnabled(False)
        self.load_worker = LoadWorker(file_path)
        self.load_worker.progress.connect(self.update_progress)
        self.load_worker.finished.connect(self.data_load_complete)
        self.load_worker.error.connect(self.data_load_error)
        self.load_worker.start()

    def update_progress(self, value):
        """Forward the worker's progress percentage to the dialog."""
        self.progress_dialog.setValue(value)

    def data_load_complete(self, df):
        """Show the loaded DataFrame *df* and refresh combo and status bar."""
        self.progress_dialog.reset()
        self.btn_open.setEnabled(True)

        # Install the new data model behind the proxy.
        model = DataFrameModel(df)
        self.proxy_model.setSourceModel(model)

        # Refresh the column list; entry 0 stays "all columns".
        self.column_combo.clear()
        self.column_combo.addItem("모든 컬럼")
        # Column labels may not be strings (e.g. integer labels), but
        # addItems() requires strings.
        self.column_combo.addItems([str(name) for name in df.columns])

        # Status bar summary (file size in MB).
        file_size = os.path.getsize(self.load_worker.file_path) / (1024 * 1024)
        self.status_bar.showMessage(
            f"로드 완료: {len(df):,}행 | {len(df.columns)}열 | {file_size:.2f}MB | "
            f"Dict/List 컬럼: {self.count_complex_columns(df)}개"
        )

    def count_complex_columns(self, df):
        """Count the columns of *df* whose first value is a dict or list.

        NOTE(review): LoadWorker converts dict/list values to strings before
        emitting the frame, so this presumably reports 0 for frames loaded
        through it - confirm whether the count should be taken earlier.
        """
        if len(df) == 0:
            return 0
        return sum(
            1 for col in df.columns
            if isinstance(df[col].iloc[0], (dict, list))
        )

    def data_load_error(self, error_msg):
        """Report a loading failure to the user."""
        self.progress_dialog.reset()
        self.btn_open.setEnabled(True)
        QMessageBox.critical(self, "로드 오류", error_msg)
        self.status_bar.showMessage("로드 실패")

    def cancel_loading(self):
        """Abort a running load when the user cancels the progress dialog."""
        if self.load_worker is not None and self.load_worker.isRunning():
            self.load_worker.terminate()
            # terminate() may not stop the thread immediately; wait so the
            # worker is not replaced or destroyed while still running.
            self.load_worker.wait()
        self.progress_dialog.reset()
        self.btn_open.setEnabled(True)
        self.status_bar.showMessage("로딩 취소됨")
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the viewer
    # window, and exit the process with the event loop's return code.
    app = QApplication(sys.argv)
    app.setStyle('Fusion')

    viewer = DataViewer()
    viewer.show()

    exit_code = app.exec_()
    sys.exit(exit_code)