랭귀지/pandas
df비교
유키공
2025. 3. 27. 16:47
import pandas as pd
import numpy as np
# Sample data (includes NaN). The two frames carry different index labels;
# reset_index(drop=True) below discards them, so rows are compared by position.
df1 = pd.DataFrame({'A': [1, 2, np.nan], 'B': ['a', 'b', 'c']}, index=[0, 1, 2])
df2 = pd.DataFrame({'A': [1, 2, 4], 'B': ['a', 'x', np.nan]}, index=[1, 2, 3])

value_cols = list(df1.columns)

# 1. Outer merge on every data column.
#    The merged frame gets a brand-new index that says nothing about where a
#    row sat in its source frame, so each side carries its own row position
#    through the merge in a helper column.
left = df1.reset_index(drop=True)
left['_pos_df1'] = range(len(left))
right = df2.reset_index(drop=True)
right['_pos_df2'] = range(len(right))
merged = pd.merge(
    left,
    right,
    how='outer',
    on=value_cols,
    indicator='_source',
)

# Keep the positions aside (they share `merged`'s index) so that `merged`
# itself holds only the data columns plus the indicator.
positions = merged[['_pos_df1', '_pos_df2']]
merged = merged.drop(columns=['_pos_df1', '_pos_df2'])

# 2. Keep only the rows that exist on one side only, and label their origin.
#    The indicator column is categorical; convert to plain strings before
#    relabelling instead of calling .replace() on the categorical.
diff_rows = merged[merged['_source'] != 'both'].copy()
diff_rows['_source'] = diff_rows['_source'].astype(str).map({
    'left_only': 'df1',
    'right_only': 'df2',
})

# 3. NaN-safe comparison + highlighting.
#    Annotated cells become strings, so the value columns must be object dtype
#    first; writing a string into a float column is an incompatible-dtype set.
diff_rows = diff_rows.astype({col: object for col in value_cols})

for idx, row in diff_rows.iterrows():
    source = row['_source']
    other_df = df2 if source == 'df1' else df1
    # Position of this row inside the frame it came from.
    pos = int(positions.at[idx, f'_pos_{source}'])
    if pos >= len(other_df):
        # The other frame has no row at this position: nothing to compare
        # against, so the row is left un-annotated.
        continue
    other_row = other_df.iloc[pos]
    for col in value_cols:
        val = row[col]
        other_val = other_row[col]
        val_missing = pd.isna(val)
        other_missing = pd.isna(other_val)
        # NaN != NaN in Python, so missing values are compared via pd.isna():
        # two NaNs count as equal, exactly one NaN counts as a difference.
        differs = (val_missing != other_missing) or (
            not val_missing and val != other_val
        )
        if differs:
            diff_rows.at[idx, col] = (
                f"NaN ({source})" if val_missing else f"{val} ({source})"
            )

# 4. Final result: data columns followed by the origin label.
diff_rows = diff_rows[value_cols + ['_source']]
print(diff_rows)