기초 통계와 데이터 시각화

글 한눈에 보기

문제 설정

기초 통계와 데이터 시각화에서 Seaborn 실습 흐름을 직접 따라가며 구현했습니다.

원본 구조

Seaborn 실습

데이터 맥락

특정 데이터셋을 설명하기보다는 Seaborn 실습 흐름을 직접 따라 하며 익히는 데 초점을 둔 노트입니다.

주요 장

Seaborn 실습

구현 흐름

CSV 데이터 불러오기 -> 데이터셋 불러오기 -> 파생 변수 추가

자료

ipynb / md · 코드 37 · 실행 35

주요 스택

matplotlib, warnings, numpy, seaborn 외 1

ㄹ!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

!apt-get update -qq
!apt-get install fonts-nanum* -qq

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import warnings
warnings.filterwarnings(action='ignore')

path = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'  # NanumBarunGothic

# Register the font file first so the family name selected below can be resolved.
fm.fontManager.addfont(path)

font_name = fm.FontProperties(fname=path, size=10).get_name()  # default font size: 10
plt.rc('font', family=font_name)

# Draw negative numbers with the ASCII hyphen instead of the Unicode minus sign
# (U+2212); the Korean font may not provide that glyph, which would otherwise
# show up as an empty box on negative tick labels.
plt.rc('axes', unicode_minus=False)

import numpy as np

# Small sample used to illustrate quartiles and the interquartile range.
x = [5, 7, 8, 9, 10, 12, 13, 14, 20]

# 25th / 50th / 75th percentiles computed in a single call.
q1, q2, q3 = np.percentile(x, [25, 50, 75])
iqr = q3 - q1  # interquartile range: Q3 minus Q1

for label, value in (("Q1: ", q1), ("Q2(중앙값): ", q2), ("Q3: ", q3), ('IQR: ', iqr)):
    print(label, value)

# Box plot - check for outliers (horizontal layout)
fig, ax = plt.subplots(figsize=(4, 3))
ax.boxplot(x, vert=False)
ax.set_title("사분위수 시각화")
plt.show()

# Same sample as before with two extreme values (-100 and 79) added.
x = [-100, 5, 7, 8, 9, 10, 12, 13, 14, 20, 79]

# Evaluate each quartile individually, then unpack them in order.
q1, q2, q3 = (np.percentile(x, pct) for pct in (25, 50, 75))
iqr = q3 - q1  # interquartile range: Q3 minus Q1

print("Q1: ", q1)
print("Q2(중앙값): ", q2)
print("Q3: ", q3)
print('IQR: ', iqr)

# Box plot - check for outliers; the extreme values are drawn as flier markers
fig, ax = plt.subplots(figsize=(4, 3))
ax.boxplot(x, vert=False)
ax.set_title("사분위수 시각화")
plt.show()

# Box plot - customisation (1): style dictionaries passed inline
fig, ax = plt.subplots(figsize=(7, 3))
ax.boxplot(
    x,
    vert=False,
    patch_artist=True,  # fill the box with colour
    boxprops={'facecolor': 'skyblue', 'color': 'blue'},   # box style
    medianprops={'color': 'red', 'linewidth': 3},         # median line style
    whiskerprops={'color': 'gray', 'linestyle': '--'},    # whisker style
    flierprops={'marker': '*', 'markersize': 8},          # outlier marker style
)
ax.set_title("사분위수 시각화", fontsize=15, fontweight='bold')
ax.grid(axis='x', linestyle='--', alpha=0.4)
# A single box sits at position 1; give it a readable label.
ax.set_yticks([1])
ax.set_yticklabels(['Group A'])
plt.show()

# Box plot - customisation (2): style dictionaries defined up front and reused

box_style = {'facecolor': 'skyblue', 'color': 'blue'}    # box style
median_style = {'color': 'red', 'linewidth': 3}          # median line style
whisker_style = {'color': 'gray', 'linestyle': '--'}     # whisker style
flier_style = {'marker': '*', 'markersize': 8}           # outlier marker style

# Collect every boxplot option in one place and unpack it into the call.
boxplot_options = {
    'vert': False,
    'patch_artist': True,  # fill the box with colour
    'boxprops': box_style,
    'medianprops': median_style,
    'whiskerprops': whisker_style,
    'flierprops': flier_style,
}

plt.figure(figsize=(7, 3))
plt.boxplot(x, **boxplot_options)
plt.title("사분위수 시각화", fontsize=15, fontweight='bold')
plt.grid(axis='x', linestyle='--', alpha=0.4)
plt.yticks([1], ['Group A'])
plt.show()

# Histogram exercise (1): the same data drawn with the default bins and with 20 bins

weight = [68, 81, 64, 56, 78, 74, 61, 77, 66, 68, 59, 71,
          80, 59, 67, 81, 69, 73, 69, 74, 70, 65]

fig, ax = plt.subplots(figsize=(8, 4))

ax.hist(weight, label='bins=10')
ax.hist(weight, bins=20, label='bins=20')

ax.legend()
ax.set_title("두 개 히스토그램 예시")
plt.show()

# Histogram exercise (2): two datasets overlaid, the second one semi-transparent

weight = [68, 81, 64, 56, 78, 74, 61, 77, 66, 68, 59, 71,
          80, 59, 67, 81, 69, 73, 69, 74, 70, 65]
x = [50, 66, 60, 53, 70, 76, 80, 90, 100]

fig, ax = plt.subplots(figsize=(8, 4))

ax.hist(x, label='bins=10 - x')
# alpha keeps the first histogram visible underneath the red one.
ax.hist(weight, label='bins=10 - weight', color='red', alpha=0.4)

ax.legend()
ax.set_title("두 개 히스토그램 예시")
plt.show()

# Histogram exercise (3): both datasets passed in one call so they share the same bins

weight = [68, 81, 64, 56, 78, 74, 61, 77, 66, 68, 59, 71,
          80, 59, 67, 81, 69, 73, 69, 74, 70, 65]
x = [50, 66, 60, 53, 70, 76, 80, 90, 100]

fig, ax = plt.subplots(figsize=(8, 4))

ax.hist([weight, x], label=['weight', '데이터 x'], alpha=0.7)

ax.legend()
ax.set_title("두 개 히스토그램 예시")
plt.show()

# KDE plot
import seaborn as sns

# Sample values for the density estimate.
x = [1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10]

fig, ax = plt.subplots(figsize=(4, 3))
sns.kdeplot(x, fill=True, ax=ax)  # fill=True shades the area under the curve
ax.set_title("KDE Plot")
ax.grid(True, alpha=0.3)
plt.show()

# Mount Google Drive so files stored there are reachable (requires Google Colab)
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Read the body-measurement CSV from the mounted drive into a DataFrame.
body_csv_path = '/content/drive/MyDrive/코드잇/AI 엔지니어 5기/공유폴더/Data/body.csv'
df = pd.read_csv(body_csv_path)
df

# Histogram with a KDE curve overlaid
# NOTE(review): the whole DataFrame is passed, so seaborn draws its columns
# together; if only one column (e.g. height) was intended, confirm and narrow it.

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, kde=True, bins=30, edgecolor='black', ax=ax)
ax.set_title("Histogram + KDE")
ax.grid(True, alpha=0.4)
plt.show()

# KDE of height via pandas with the default bandwidth
plt.figure(figsize=(4, 4))
df['height'].plot.kde()
plt.show()

# Same plot with an explicit bandwidth (bw_method=0.9)
plt.figure(figsize=(4, 4))
df['height'].plot.kde(bw_method=0.9)
plt.show()

# KDE of height via seaborn
fig, ax = plt.subplots(figsize=(4, 3))
sns.kdeplot(df['height'], ax=ax)
plt.show()

## Exercise 1 - box plots for height and weight outliers
import numpy as np

# Reference solution (basic) - height
height_col = df['height']
q1, q2, q3 = (np.percentile(height_col, pct) for pct in (25, 50, 75))
iqr = q3 - q1  # interquartile range: Q3 minus Q1

print("Q1:", q1)
print("Q2 (중앙값):", q2)
print("Q3:", q3)
print("IQR:", iqr)

fig, ax = plt.subplots(figsize=(4, 3))
ax.boxplot(height_col, vert=False)
ax.set_title("height IQR")
ax.set_xlabel("값")
plt.show()

# Reference solution (basic) - weight
weight_col = df['weight']
q1, q2, q3 = (np.percentile(weight_col, pct) for pct in (25, 50, 75))
iqr = q3 - q1  # interquartile range: Q3 minus Q1

for label, value in (("Q1:", q1), ("Q2 (중앙값):", q2), ("Q3:", q3), ("IQR:", iqr)):
    print(label, value)

fig, ax = plt.subplots(figsize=(4, 3))
ax.boxplot(weight_col, vert=False)
ax.set_title("weight IQR")
ax.set_xlabel("값")
plt.show()

Seaborn 실습

# Scatter plot of height against weight

fig, ax = plt.subplots(figsize=(5, 3))
sns.scatterplot(data=df, x='height', y='weight', ax=ax)
ax.set_title('키 vs 몸무게 산점도')
plt.show()

# Binning - derive interval (category) columns from height and weight

height_edges = range(150, 200, 5)   # bin edges 150, 155, ..., 195
weight_edges = range(40, 120, 10)   # bin edges 40, 50, ..., 110

# pd.cut leaves values that fall outside the edges as NaN.
df['height_bin'] = pd.cut(df['height'], bins=height_edges)
df['weight_bin'] = pd.cut(df['weight'], bins=weight_edges)

# Bar plot - bar height is the aggregated weight within each height bin

fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(data=df, x='height_bin', y='weight', errorbar=None, ax=ax)  # no error bars
ax.set_title("키와 몸무게 평균 비교")
plt.xticks(rotation=45)  # interval labels are long; tilt them
plt.show()

# Weight distribution per height bin, drawn with four categorical plot types.
# Each entry: (seaborn function, figure size, title, extra keyword arguments)
binned_plots = (
    (sns.boxplot, (6, 3), '키 박스플롯', {}),                      # box plot per bin
    (sns.violinplot, (10, 4), '키 바이올린 플롯', {}),             # violin: box plot + density
    (sns.stripplot, (10, 4), '키 스트립 플롯', {'jitter': True}),  # strip plot with jitter
    (sns.swarmplot, (10, 4), '키 스왐플롯', {}),                   # swarm plot
)

for draw_plot, fig_size, plot_title, extra_kwargs in binned_plots:
    plt.figure(figsize=fig_size)
    draw_plot(data=df, x='height_bin', y='weight', **extra_kwargs)
    plt.title(plot_title)
    plt.xticks(rotation=45)
    plt.show()

# Histogram of height
fig, ax = plt.subplots(figsize=(5, 3))
sns.histplot(data=df, x='height', ax=ax)
ax.set_title("키 - 히스토그램")
plt.show()

# Joint plot: scatter of height vs weight with marginal distributions
#
# jointplot is a figure-level function that always creates its own figure, so
# calling plt.figure() beforehand only leaves an empty extra figure and does
# not control the size. The (square) size is set through `height` instead.
sns.jointplot(data=df, x='height', y='weight', kind='scatter', height=4)
# plt.title() would label only the most recently created axes of the grid;
# to title the whole plot, call .figure.suptitle(...) on the returned JointGrid.
plt.show()

# Pair plot - pairwise relationships between the DataFrame's columns

sns.pairplot(data=df)
plt.show()

# List the example datasets available through seaborn (no need to memorise this call)
sns.get_dataset_names()

# Load the Titanic example dataset
titanic_data = sns.load_dataset(name='titanic')
titanic_data

씨본 스타일

팔레트

# Palette demo 1: "pastel" (other options: deep, muted, ...)
sns.set_palette("pastel")
plt.figure(figsize=(4, 3))
sns.barplot(data=titanic_data, x='age', y='class', errorbar=None)
plt.show()

# Palette demo 2: "deep"
sns.set_palette('deep')
plt.figure(figsize=(4, 3))
sns.violinplot(data=titanic_data, x='age')
plt.show()

# set_theme()
#
# Apply the theme before the figure is created so every element is built with
# it. set_theme() also resets the font family to its default, which would
# discard the Korean font configured earlier, so the font is passed explicitly.
sns.set_theme(style='whitegrid', font=font_name)

plt.figure(figsize=(4,3))
sns.violinplot(data=titanic_data, x='age')
plt.show()

상관관계

# Correlation matrix computed from the numeric columns only
numeric_columns = titanic_data.select_dtypes(include="number")
corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, cmap='coolwarm', annot=True, ax=ax)  # annot=True prints each coefficient
plt.show()

# Visualise missing values as a heatmap (one row per record, one column per field)

missing_mask = titanic_data.isna()  # True where a value is missing
sns.heatmap(missing_mask, cbar=False, yticklabels=False)
plt.show()