AICE Associate는 AI 역량 인증을 위해 KT와 한국경제신문이 함께 주관하는 민간자격입니다.
주로 탐색적 데이터 분석, 데이터 전처리, 머신러닝/딥러닝 모델링, 모델 성능 평가를 다루는 시험입니다.
- AICE ASSOCIATE
14문항 / 90분
온라인 시험
80점 이상 합격
시험비 : 80,000원
자격 유효 기간 : 3년
- 중요 팁 5가지!
1. label 인코딩, 원핫인코딩, train_test_split 꼭 나온다!
2. 문제에 옵션을 그대로 반영한다!
3. 매번 다른 모델이 나온다!
4. process는 똑같다!
5. 딥러닝 2문제가 나오는데, 모델링이 배점이 더 높다!
별칭으로 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
로딩
# Load the VOC dataset from CSV (file expected in the working directory).
df = pd.read_csv('voc_data.csv')
데이터프레임 앞부분, 뒷부분 불러오기
# Peek at the first and last 5 rows.
df.head(5), df.tail(5)
데이터프레임 정보
df.info() # column names, non-null counts, and dtypes
df.dtypes # dtype of each column
df.columns # column labels
인덱스, 컬럼, 값
# Basic structure: index, columns, raw values, and shape.
df.index
df.columns, list(df)  # two equivalent ways to get the column labels
df.values, df.value_counts()  # underlying ndarray / counts of unique rows
df.shape
통계치, null값 갯수 확인, 건수 확인
df.describe()  # summary statistics for numeric columns
df.isnull().sum()  # null count per column
df['voc_trt_perd_itg_cd'].value_counts()  # frequency of each value in this column
데이터 파일 합치기
# Inner-join two frames on the shared key column 'RID'.
# NOTE(review): df_a and df_b are assumed to be loaded elsewhere — not defined in these notes.
df = pd.merge(df_a, df_b, how='inner', on='RID')
_가 차지하는 비율이 50%를 넘는 것을 확인하고 삭제
# Collect the columns where the '_' placeholder makes up more than half of the
# values, then drop them from df1 in place.
column_list = [col for col in df1.columns if df1[col].eq('_').mean() > 0.5]
df1.drop(columns=column_list, inplace=True)
_이 몇개 있는지 확인
# Count how many '_' placeholder values remain in this column.
count = (df1['cust_clas_itg_cd'] == "_").sum()
count
_값을 Null값으로 변경하고 저장, 값 바꾸기
# Replace every '_' placeholder with NaN across the whole frame.
df2 = df1.replace('_', np.nan)
# Four equivalent ways to map 'Yes'/'No' to 1/0 — use any ONE of them.
# NOTE(review): df3 is never created in these notes — presumably meant to be df2; confirm.
df3['Churn'] = df2['Churn'].replace({'Yes': 1, 'No': 0})
df3['Churn'] = df2['Churn'].replace(['Yes','No'],[1,0])
df3['Churn'] = df2['Churn'].map({'Yes': 1, 'No': 0})
df3['Churn'] = np.where(df2['Churn'] == 'Yes', 1, 0)
최빈값으로 null값 채우기
# Fill nulls with the mode (template: put the column name inside ['']).
mode = df[''].value_counts(dropna=True).idxmax()  # most frequent value, ignoring NaN
mode = df[''].mode()[0]  # equivalent alternative
df2 = df.copy()
df2[''] = df2[''].fillna(mode)
중앙값으로 null값 채우기
# Fill nulls in a numeric column with its median (template: put the column name in ['']).
# Coerce to numeric first so non-numeric placeholders become NaN.
# FIX: the original read df3[''] here while every other line operates on df — typo.
df[''] = pd.to_numeric(df[''], errors='coerce')
median_value = df[''].median()
df2 = df.copy()
# astype(int) assumes the column is meant to be integer-valued after filling.
df2[''] = df2[''].fillna(median_value).astype(int)
컬럼 삭제
# Drop a column in place (template: put the column name inside ['']).
df.drop([''], axis = 1, inplace=True)
타입 바꾸기
# Cast TotalCharges from object/string to float.
df1['TotalCharges'] = df1['TotalCharges'].astype('float')
object 타입의 열만 가져오기
# Two equivalent ways to list the object-dtype (string) columns; the second overwrites the first.
object_cols = [column for column in df4.columns if df4[column].dtype == 'object']
object_cols = df4.select_dtypes(include='object').columns.tolist()
정확도 출력
from sklearn.metrics import accuracy_score
# NOTE(review): argmax assumes a softmax output (one probability per class);
# for a single sigmoid output use a 0.5 threshold instead.
y_test_pred = model.predict(x_test_scaled)
y_test_pred = np.argmax(y_test_pred, axis=1)  # class index with the highest probability
accuracy_score(y_test, y_test_pred)
원핫인코딩
# One-hot encode all object columns; drop_first drops one dummy per column
# to avoid redundant (perfectly collinear) features.
object_cols = [column for column in df4.columns if df4[column].dtype == 'object']
df5 = pd.get_dummies(df4, columns=object_cols, drop_first=True, dtype=int)
데이터 나누기
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the target so the class balance is preserved.
target = 'Churn'
X = df5.drop(columns=target)
y = df5.loc[:,target]
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size=0.2, stratify=y, random_state=42)
정규화
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only, then apply the same transform to test data.
# NOTE(review): lowercase x_train/x_test are not the X_train/X_valid created above —
# this snippet comes from a separate train/test workflow; confirm the names.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
정규화2
from sklearn.preprocessing import MinMaxScaler
# Scale features to [0, 1]; fit on train only, reuse the fitted scaler for validation.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
모델들
# Logistic regression: C=10 weakens regularization; max_iter raised so the solver converges.
from sklearn.linear_model import LogisticRegression
model_lg = LogisticRegression(C=10, max_iter=2000)
model_lg.fit(X_train, y_train)
model_lg.score(X_train, y_train)  # training accuracy
y_pred1 = model_lg.predict(X_valid)
# K-nearest neighbors with k=5.
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)
model_knn.score(X_train, y_train)  # training accuracy
y_pred2 = model_knn.predict(X_valid)
# Decision tree; max_depth and min_samples_split limit overfitting.
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier(max_depth=5, min_samples_split=3, random_state=120)
model_dt.fit(X_train, y_train)
model_dt.score(X_train, y_train)  # training accuracy
y_pred3 = model_dt.predict(X_valid)
# Random forest. NOTE(review): n_estimators=3 is very small (default is 100) —
# fast to train for the exam, but weak; raise it for real use.
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=3, random_state=42)
model_rf.fit(X_train, y_train)
model_rf.score(X_train, y_train)  # training accuracy
y_pred4 = model_rf.predict(X_valid)
# XGBoost gradient boosting (third-party package, must be installed separately).
from xgboost import XGBClassifier
model_xgb = XGBClassifier(n_estimators=3, random_state=42)
model_xgb.fit(X_train, y_train)
model_xgb.score(X_train, y_train)  # training accuracy
y_pred5 = model_xgb.predict(X_valid)
# LightGBM gradient boosting (third-party package, must be installed separately).
from lightgbm import LGBMClassifier
model_lgbm = LGBMClassifier(n_estimators=3, random_state=42)
model_lgbm.fit(X_train, y_train)
model_lgbm.score(X_train, y_train)  # training accuracy
y_pred6 = model_lgbm.predict(X_valid)
# Linear regression (separate regression example; x_data/y_data are not defined in these notes).
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_data, y_data)
model.score(x_data, y_data)  # R^2 on the training data
model.predict([[5]])  # predict for one sample with a single feature
예측 결과
# Binarize the validation predictions at the 0.5 threshold, then score accuracy.
y_pred = model.predict(X_valid)
y_pred = (y_pred >= 0.5).astype(int)
accuracy = accuracy_score(y_valid, y_pred)
원핫인코딩 후
from keras.utils import to_categorical
# NOTE(review): clear_session, Sequential, Input, Dense, Dropout, EarlyStopping and
# ModelCheckpoint are not imported in these notes — they come from tensorflow.keras
# (backend / models / layers / callbacks); confirm the imports before running.
clear_session()
# One-hot encode the integer labels so they match the 2-unit softmax output.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
nfeatures = x_train_scaled.shape[1]
# Funnel-shaped MLP (64 -> 32 -> 16) with dropout after each hidden layer;
# 2-unit softmax output paired with categorical_crossentropy and one-hot labels.
model = Sequential([Input(shape=(nfeatures,)),
Dense(64, activation='relu'),
Dropout(0.2),
Dense(32, activation='relu'),
Dropout(0.2),
Dense(16, activation='relu'),
Dropout(0.2),
Dense(2, activation='softmax')])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Stop when val_loss hasn't improved for 3 epochs, restore the best weights,
# and checkpoint the best model to disk.
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
mc = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)
hist = model.fit(x_train_scaled, y_train, batch_size=10, epochs=10, callbacks=[es,mc], validation_data=(x_test_scaled, y_test), verbose=1)
그냥 sequential
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
# Binary-classification MLP: single sigmoid output with binary_crossentropy,
# so the labels stay as 0/1 (no one-hot encoding needed).
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu', ))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train for 30 epochs, monitoring the validation split each epoch.
history = model.fit(X_train, y_train, epochs=30, batch_size=16, validation_data=(X_valid, y_valid))
성능평가
# Confusion matrix + classification report for the validation predictions.
# FIX: confusion_matrix and classification_report were never imported anywhere in the notes.
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_valid, y_pred)
plt.figure(figsize=(5,5))
sns.heatmap(cm,
            annot=True,   # write the count in each cell
            cmap='Blues',
            cbar=False,
            square=True,
            fmt='d')      # integer formatting for the counts
plt.yticks(rotation=0)    # the original used rotation=360 — a full turn, identical to 0
plt.xticks(rotation=0)
plt.show()
# Per-class precision / recall / F1.
print(classification_report(y_valid, y_pred))
학습 정확도와 검증 정확도 표시
# Plot training vs validation accuracy per epoch from the Keras History object.
plt.figure(figsize=(10, 5))
plt.plot(hist.history['accuracy'])
plt.plot(hist.history['val_accuracy'])
plt.title('Accuracy : Training vs Validation')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc=0)  # loc=0 lets matplotlib pick the best spot
plt.show()
학습 손실과 검증 손실 표시
# Plot training vs validation loss per epoch from the Keras History object.
plt.figure(figsize=(10, 5))
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('Loss : Training vs Validation')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Training', 'Validation'], loc=0)  # loc=0 lets matplotlib pick the best spot
plt.show()
MSE 표시
# Plot training vs validation MSE per epoch (regression models).
# NOTE(review): requires the model to have been compiled with metrics=['mse'].
import matplotlib.pyplot as plt  # redundant (already imported above) but harmless
train_mse = history.history['mse']
val_mse = history.history['val_mse']
plt.figure(figsize=(10, 6))
plt.plot(train_mse, label='train mse', color='blue')
plt.plot(val_mse, label='val mse', color='orange')
plt.title('Model MSE')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.show()