카테고리 없음

csv reader

유키공 2025. 4. 30. 13:08
import json
import os
import sys
import traceback

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from PyQt5.QtCore import (
    Qt, QAbstractTableModel, QSortFilterProxyModel,
    QThread, pyqtSignal, QObject
)
from PyQt5.QtGui import QColor
from PyQt5.QtWidgets import (
    QApplication, QMainWindow, QTableView, QFileDialog,
    QVBoxLayout, QWidget, QPushButton, QLabel,
    QStatusBar, QMessageBox, QLineEdit, QHBoxLayout,
    QComboBox, QHeaderView, QProgressDialog
)

class LoadWorker(QThread):
    """Background thread that loads a Parquet or CSV file into a DataFrame.

    Every cell of the result is converted to display text: dict/list values
    become JSON, missing values become empty strings, and everything else is
    passed through ``str()``.

    Signals:
        progress(int): Load progress in percent.
        finished(pandas.DataFrame): The fully loaded, stringified frame.
        error(str): Failure message followed by the formatted traceback.
    """
    progress = pyqtSignal(int)
    # NOTE(review): this reuses the name of QThread's built-in `finished`
    # signal; kept unchanged because callers connect to it expecting a frame.
    finished = pyqtSignal(pd.DataFrame)
    error = pyqtSignal(str)

    # Number of rows pandas parses per chunk when streaming a CSV file.
    CSV_CHUNK_ROWS = 100000

    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path

    @staticmethod
    def _json_fallback(obj):
        """Make a value json.dumps cannot encode natively serializable.

        Objects exposing ``tolist()`` (NumPy arrays and scalars, for example)
        are unpacked into plain Python values; anything else becomes ``str``.
        """
        to_list = getattr(obj, "tolist", None)
        if callable(to_list):
            return to_list()
        return str(obj)

    def safe_json_dumps(self, obj):
        """Return *obj* as text without ever raising.

        dict/list values are rendered as JSON (non-ASCII kept readable);
        every other value goes through ``str()``. If the conversion fails the
        placeholder ``"[Conversion Error]"`` is returned instead.
        """
        try:
            if isinstance(obj, (dict, list)):
                return json.dumps(
                    obj, ensure_ascii=False, default=self._json_fallback
                )
            return str(obj)
        except Exception:
            # Best effort for display only; Exception (not a bare except)
            # so KeyboardInterrupt/SystemExit still propagate.
            return "[Conversion Error]"

    def _cell_to_text(self, value):
        """Convert one cell of an object column to its display string."""
        if isinstance(value, str):
            return value
        if isinstance(value, (dict, list)):
            return self.safe_json_dumps(value)
        if pd.api.types.is_scalar(value):
            # Covers None, NaN, NaT and pd.NA as well as ordinary scalars.
            return "" if pd.isna(value) else str(value)
        # Array-like cells: presumably NumPy arrays produced for Parquet list
        # columns - TODO confirm against the pyarrow version in use.
        to_list = getattr(value, "tolist", None)
        if callable(to_list):
            return self.safe_json_dumps(to_list())
        return str(value)

    def convert_complex_types(self, df):
        """Convert every column of *df* to display strings, in place.

        Missing values are captured before stringification, so they end up
        as ``""`` rather than the literal text ``"nan"``. Object columns are
        inspected cell by cell instead of sampling only the first row, so a
        dict/list appearing after a null first row is still rendered as JSON.

        Returns:
            The same DataFrame object, for call chaining.
        """
        for col in df.columns:
            try:
                series = df[col]
                if pd.api.types.is_object_dtype(series):
                    inferred = pd.api.types.infer_dtype(series, skipna=True)
                    if inferred == "string":
                        # Fast path: plain text column, only blank the nulls.
                        df[col] = series.where(series.notna(), "")
                    else:
                        df[col] = series.map(self._cell_to_text)
                else:
                    missing = series.isna()
                    df[col] = series.astype(str).mask(missing, "")
            except Exception as e:
                print(f"컬럼 {col} 처리 오류: {e}")
                df[col] = df[col].astype(str)
        return df

    def _load_parquet(self):
        """Read the Parquet file one row group at a time.

        Returns:
            The concatenated frame, or None if interruption was requested.
        """
        parquet_file = pq.ParquetFile(self.file_path)
        num_row_groups = parquet_file.num_row_groups
        chunks = []

        for i in range(num_row_groups):
            if self.isInterruptionRequested():
                return None
            self.progress.emit(10 + int((i + 1) / num_row_groups * 70))
            table = parquet_file.read_row_group(i)
            chunks.append(self.convert_complex_types(table.to_pandas()))

        self.progress.emit(90)
        if not chunks:
            # No row groups: pd.concat([]) would raise, so read the (empty)
            # file as a whole to get a frame that still carries the columns.
            return self.convert_complex_types(parquet_file.read().to_pandas())
        return pd.concat(chunks, ignore_index=True)

    def _load_csv(self):
        """Read the CSV file in chunks of CSV_CHUNK_ROWS rows.

        Returns:
            The concatenated frame, or None if interruption was requested.
        """
        # Physical line count is only a progress estimate; binary mode avoids
        # decoding the file twice, and the handle is closed deterministically.
        with open(self.file_path, 'rb') as handle:
            line_count = sum(1 for _ in handle)
        # Exclude the header line; never below 1 so the ratio cannot divide
        # by zero on an empty or header-only file.
        total_rows = max(line_count - 1, 1)

        chunks = []
        processed_rows = 0
        reader = pd.read_csv(self.file_path, chunksize=self.CSV_CHUNK_ROWS)
        try:
            for chunk in reader:
                if self.isInterruptionRequested():
                    return None
                fraction = min(processed_rows / total_rows, 1.0)
                self.progress.emit(10 + int(fraction * 70))
                chunks.append(self.convert_complex_types(chunk))
                processed_rows += len(chunk)
        finally:
            reader.close()

        self.progress.emit(90)
        if not chunks:
            # Header-only file: keep the column names, zero rows.
            header_only = pd.read_csv(self.file_path, nrows=0)
            return self.convert_complex_types(header_only)
        return pd.concat(chunks, ignore_index=True)

    def run(self):
        """Thread entry point: load the file and emit the outcome.

        Emits ``finished`` with the frame on success and ``error`` with a
        message on failure. Emits neither when interruption was requested
        through ``requestInterruption()``.
        """
        try:
            self.progress.emit(5)

            # Case-insensitive so "DATA.PARQUET" is not parsed as CSV.
            if self.file_path.lower().endswith('.parquet'):
                result_df = self._load_parquet()
            else:
                result_df = self._load_csv()

            if result_df is None:
                return  # cancelled between chunks

            # Nulls were already blanked per chunk in convert_complex_types,
            # so no frame-wide fillna pass (and copy) is needed here.
            self.progress.emit(95)
            self.progress.emit(100)
            self.finished.emit(result_df)

        except Exception as e:
            error_msg = f"로딩 실패: {str(e)}\n\n{traceback.format_exc()}"
            self.error.emit(error_msg)

class DataFrameModel(QAbstractTableModel):
    """Read-only Qt table model backed by a pandas DataFrame.

    Cells are shown as text; cells holding a dict or list get a light blue
    background, all others a white one.
    """

    def __init__(self, data):
        """Wrap *data* (a pandas DataFrame); the frame is not copied."""
        super().__init__()
        self._data = data

    @staticmethod
    def _is_missing(value):
        """Return True only for scalar nulls (None, NaN, NaT, pd.NA).

        pd.isna() returns an array for list-like input, and an array has no
        single truth value, so non-scalars are reported as not missing.
        """
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    def rowCount(self, parent=None):
        """Return the number of rows; 0 under a valid parent (flat table)."""
        if parent is not None and parent.isValid():
            return 0
        return len(self._data)

    def columnCount(self, parent=None):
        """Return the number of columns; 0 under a valid parent."""
        if parent is not None and parent.isValid():
            return 0
        return len(self._data.columns)

    def data(self, index, role=Qt.DisplayRole):
        """Return the display text or background colour for *index*.

        Returns None for an invalid index and for any other role.
        """
        if not index.isValid():
            return None
        # Views query many roles per cell; skip the frame lookup for the
        # ones this model does not serve.
        if role != Qt.DisplayRole and role != Qt.BackgroundRole:
            return None

        value = self._data.iat[index.row(), index.column()]

        if role == Qt.DisplayRole:
            return "" if self._is_missing(value) else str(value)

        if isinstance(value, (dict, list)):
            return QColor(240, 248, 255)  # highlight for complex values
        return QColor(255, 255, 255)

    def headerData(self, section, orientation, role=Qt.DisplayRole):
        """Return the column name (horizontal) or index label (vertical)."""
        if role == Qt.DisplayRole:
            if orientation == Qt.Horizontal:
                return str(self._data.columns[section])
            return str(self._data.index[section])
        return None

class DataViewer(QMainWindow):
    """Main window: opens a Parquet/CSV file and shows it in a sortable,
    filterable table, loading the data on a background LoadWorker thread.
    """

    # How long cancel_loading() waits for the worker to stop on its own
    # before falling back to a forced termination (milliseconds).
    CANCEL_GRACE_MS = 2000

    def __init__(self):
        super().__init__()
        self.load_worker = None
        self.model = None
        self.setWindowTitle("Data Viewer with Dict Handling")
        self.setGeometry(100, 100, 1200, 800)
        self.setup_ui()

    def setup_ui(self):
        """Build the widgets and wire their signals."""
        self.central_widget = QWidget()
        self.setCentralWidget(self.central_widget)
        layout = QVBoxLayout(self.central_widget)

        # Control panel
        control_panel = QWidget()
        control_layout = QHBoxLayout(control_panel)

        self.btn_open = QPushButton("파일 열기")
        self.btn_open.clicked.connect(self.open_file)
        control_layout.addWidget(self.btn_open)

        self.search_input = QLineEdit()
        self.search_input.setPlaceholderText("검색어 입력")
        # Filter on Enter rather than on every keystroke: each filter pass
        # walks the whole model through Python-level data() calls.
        self.search_input.returnPressed.connect(self.apply_filter)
        control_layout.addWidget(self.search_input)

        self.column_combo = QComboBox()
        self.column_combo.addItem("모든 컬럼")
        self.column_combo.currentIndexChanged.connect(self.apply_filter)
        control_layout.addWidget(self.column_combo)

        layout.addWidget(control_panel)

        # Table view
        self.table_view = QTableView()
        self.table_view.setSortingEnabled(True)
        self.proxy_model = QSortFilterProxyModel()
        self.proxy_model.setFilterCaseSensitivity(Qt.CaseInsensitive)
        self.table_view.setModel(self.proxy_model)
        layout.addWidget(self.table_view)

        # Status bar
        self.status_bar = QStatusBar()
        self.setStatusBar(self.status_bar)

        # Loading dialog
        self.progress_dialog = QProgressDialog("파일을 로드 중입니다...", "취소", 0, 100, self)
        self.progress_dialog.setWindowModality(Qt.WindowModal)
        # NOTE(review): a freshly constructed QProgressDialog can show itself
        # once its minimum-duration timer fires; reset() is intended to stop
        # that so the dialog only appears when load_data() shows it.
        self.progress_dialog.reset()
        self.progress_dialog.canceled.connect(self.cancel_loading)

    def apply_filter(self, *_signal_args):
        """Apply the search text and column choice to the proxy model.

        Combo index 0 is the "all columns" entry, which maps to filter key
        column -1; entry i (i >= 1) maps to source column i - 1.
        """
        combo_index = self.column_combo.currentIndex()
        key_column = combo_index - 1 if combo_index > 0 else -1
        self.proxy_model.setFilterKeyColumn(key_column)
        self.proxy_model.setFilterFixedString(self.search_input.text())

    def open_file(self):
        """Ask the user for a file and start loading it."""
        file_path, _ = QFileDialog.getOpenFileName(
            self, "파일 열기", "",
            "데이터 파일 (*.parquet *.csv);;모든 파일 (*)")

        if file_path:
            self.load_data(file_path)

    def load_data(self, file_path):
        """Start a LoadWorker for *file_path* and show the progress dialog."""
        self.progress_dialog.reset()
        self.progress_dialog.show()
        self.btn_open.setEnabled(False)

        self.load_worker = LoadWorker(file_path)
        self.load_worker.progress.connect(self.update_progress)
        self.load_worker.finished.connect(self.data_load_complete)
        self.load_worker.error.connect(self.data_load_error)
        self.load_worker.start()

    def update_progress(self, value):
        """Forward the worker's progress value to the dialog."""
        self.progress_dialog.setValue(value)

    def data_load_complete(self, df):
        """Install the loaded frame in the table and refresh the UI."""
        self.progress_dialog.reset()
        self.btn_open.setEnabled(True)

        # Keep a reference on self so the source model's lifetime does not
        # depend on the proxy holding one.
        self.model = DataFrameModel(df)

        # Rebuild the column list with signals blocked so apply_filter does
        # not run once per inserted item.
        self.column_combo.blockSignals(True)
        self.column_combo.clear()
        self.column_combo.addItem("모든 컬럼")
        self.column_combo.addItems([str(name) for name in df.columns])
        self.column_combo.blockSignals(False)

        self.proxy_model.setSourceModel(self.model)
        self.apply_filter()

        # Status bar; an exception escaping a slot can abort the application,
        # so a file that vanished meanwhile is reported as size 0.
        try:
            file_size = os.path.getsize(self.load_worker.file_path) / (1024 * 1024)  # in MB
        except OSError:
            file_size = 0.0
        self.status_bar.showMessage(
            f"로드 완료: {len(df):,}행 | {len(df.columns)}열 | {file_size:.2f}MB | "
            f"Dict/List 컬럼: {self.count_complex_columns(df)}개"
        )

    def count_complex_columns(self, df):
        """Count columns whose first cell is a dict or list.

        NOTE(review): LoadWorker converts dict/list cells to strings before
        emitting the frame, so for frames coming from it this is always 0.
        """
        if len(df) == 0:
            return 0
        return sum(
            1 for col in df.columns
            if isinstance(df[col].iloc[0], (dict, list))
        )

    def data_load_error(self, error_msg):
        """Report a failed load to the user."""
        self.progress_dialog.reset()
        self.btn_open.setEnabled(True)
        QMessageBox.critical(self, "로드 오류", error_msg)
        self.status_bar.showMessage("로드 실패")

    def cancel_loading(self):
        """Stop a running load and restore the idle UI state.

        The worker is first asked to stop via requestInterruption(); only if
        it has not finished within CANCEL_GRACE_MS is it terminated, since
        terminate() can kill the thread at any point in its code.
        """
        worker = getattr(self, 'load_worker', None)
        if worker is not None and worker.isRunning():
            # Detach first so a late signal cannot repopulate the UI.
            connections = (
                (worker.progress, self.update_progress),
                (worker.finished, self.data_load_complete),
                (worker.error, self.data_load_error),
            )
            for signal, slot in connections:
                try:
                    signal.disconnect(slot)
                except TypeError:
                    pass  # was not connected; nothing to undo

            worker.requestInterruption()
            if not worker.wait(self.CANCEL_GRACE_MS):
                worker.terminate()
                worker.wait()  # make sure the thread is really gone

        self.progress_dialog.reset()
        self.btn_open.setEnabled(True)
        self.status_bar.showMessage("로딩 취소됨")

if __name__ == "__main__":
    # Script entry point: create the Qt application with the Fusion style,
    # show the viewer window, and exit with the event loop's return code.
    qt_app = QApplication(sys.argv)
    qt_app.setStyle('Fusion')

    main_window = DataViewer()
    main_window.show()

    exit_code = qt_app.exec_()
    sys.exit(exit_code)