[Financial Data Study] Dacon "Stock Closing Price Prediction Competition" Code Review - Team 소뉸

Advanced Study / Financial Data

by 줴윤 2022. 5. 29. 15:05


1. Description

This post was written with reference to the code and PPT materials shared by Team 소뉸, the first-place winner in the code-sharing section of the Dacon Stock Closing Price Prediction Competition.

 

2. Code Analysis

2.1. Import Modules & Functions

1

!pip install pykrx
!pip install yfinance
!pip install finance-datareader

2

import os
import multiprocessing
import copy
import pickle
import warnings
from datetime import datetime, timedelta
from time import time, sleep, mktime
from matplotlib import font_manager as fm, rc, rcParams
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import random

import numpy as np
from numpy import array, nan, random as rnd, where
import pandas as pd
from pandas import DataFrame as dataframe, Series as series, isna, read_csv
from pandas.tseries.offsets import DateOffset
import statsmodels.api as sm
from scipy.stats import f_oneway

from sklearn import preprocessing as prep
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split as tts, GridSearchCV as GridTuner, StratifiedKFold, KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn import metrics
from sklearn.pipeline import make_pipeline

from sklearn import linear_model as lm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as qda
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
from sklearn import neighbors as knn
from sklearn import ensemble
from sklearn.model_selection import TimeSeriesSplit


import tensorflow as tf

# ===== import functions =====
import sys
sys.path.append("projects/DA_Platform")
#from DA_v5 import *  # the team's private helper module (provides seed_everything, easyIO, week_of_month, stochastic, MyOneHotEncoder, createFolder, ...)

3

# global setting
warnings.filterwarnings(action='ignore')
rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

font_path = 'myfonts/NanumSquareB.ttf'
font_obj = fm.FontProperties(fname=font_path, size=12).get_name()
rc('font', family=font_obj)

folder_path = "projects/dacon_stockprediction/"
seed_everything()
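
Note: seed_everything() (and, later, easyIO, week_of_month, stochastic, MyOneHotEncoder, and createFolder) comes from the team's private DA_v5 module, whose import is commented out above and which was not shared with the post. Below is a minimal sketch of what such a seeding helper typically does; it is an assumption, not the team's actual code.

# Hypothetical stand-in for the team's seed_everything() (DA_v5 is not public).
import os
import random
import numpy as np
import tensorflow as tf

def seed_everything(seed=42):
    # fix every common RNG so the notebook runs reproducibly
    random.seed(seed)                         # Python stdlib RNG
    np.random.seed(seed)                      # NumPy RNG
    tf.random.set_seed(seed)                  # TensorFlow RNG
    os.environ["PYTHONHASHSEED"] = str(seed)  # hash-based operations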

4

# ===== task specific functions =====
from pykrx import stock
import yfinance as yf

# buy & sell signal generator
def getBreakthroughPoint(df, col1, col2, patient_days, fill_method="fb"):
    '''
    :param df: dataframe containing col1, col2 and a 'Close' column
    :param col1: price column (e.g. Close)
    :param col2: moving average of col1
    :param patient_days: number of following days the move must hold before a crossover counts as a breakthrough
    :param fill_method: unused in the body shown here
    :return: signal series ("buy"/"sell", forward-filled)
    '''
    sigPrice = []
    flag = -1  # -1: no signal yet, 1: last signal was buy, 0: last signal was sell

    for i in range(0, len(df)):
        if df[col1][i] > df[col2][i] and flag != 1:
            tmp = df['Close'][i:(i + patient_days + 1)]
            if len(tmp) == 1:
                sigPrice.append("buy")
                flag = 1
            else:
                if (tmp.iloc[1:] > tmp.iloc[0]).all():
                    sigPrice.append("buy")
                    flag = 1
                else:
                    sigPrice.append(nan)
        elif df[col1][i] < df[col2][i] and flag != 0:
            tmp = df['Close'][i:(i + patient_days + 1)]
            if len(tmp) == 1:
                sigPrice.append("sell")
                flag = 0
            else:
                if (tmp.iloc[1:] < tmp.iloc[0]).all():
                    sigPrice.append("sell")
                    flag = 0
                else:
                    sigPrice.append(nan)
        else:
            sigPrice.append(nan)

    sigPrice = series(sigPrice)
    # before the first confirmed signal appears, fill the head of the series with
    # the opposite signal so the forward fill below has a defined starting value
    for idx, value in enumerate(sigPrice):
        if not isna(value):
            if value == "buy":
                sigPrice.iloc[:idx] = "sell"
            else:
                sigPrice.iloc[:idx] = "buy"
            break

    sigPrice.ffill(inplace=True)
    return sigPrice
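
A quick usage sketch of the function (hypothetical example: Samsung Electronics, code 005930, with a 20-day moving average and a 2-day confirmation window):

# Usage sketch: signal crossovers of Close vs. its 20-day moving average,
# confirmed only if the move holds for the next 2 days.
df = stock.get_market_ohlcv_by_date("20210101", "20211126", "005930").reset_index()
df.columns = ["Date", "Open", "High", "Low", "Close", "Volume"]
df["ma20"] = df["Close"].rolling(20).mean()
df["signal"] = getBreakthroughPoint(df, "Close", "ma20", patient_days=2)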

2.2. Load Raw Data

5

from sklearn.kernel_ridge import KernelRidge

folder_path = "/content/drive/MyDrive/Colab Notebooks/dacon/closing price/"

# ====== raw data loading ======

#1-- Get Stock List
# load the list of stock codes
stock_list = read_csv(folder_path + "open/stock_list.csv")
stock_list['종목코드'] = stock_list['종목코드'].apply(lambda x: str(x).zfill(6))


#2-- build a dict (stock_dic) keyed by stock name
stock_list.set_index("종목명", inplace=True)
selected_codes = stock_list.index.tolist()
stock_list = stock_list.loc[selected_codes]["종목코드"]
stock_dic = dict.fromkeys(selected_codes)


#3-- set the date range
start_date = '20190101'; end_date = '20211126'  # load data from 2019-01-01 through 2021-11-26


#4-- Original data loading
for stock_name, stock_code in tqdm(stock_list.items()):
    print("=====", stock_name, "=====")
    business_days = pd.DataFrame(pd.date_range(start_date, end_date, freq='B'), columns=['Date'])

    # load the stock's OHLCV data
    try:
        stock_dic[stock_name] = dict.fromkeys(["df", "target_list"])       
        # stock data
        stock_df = stock.get_market_ohlcv_by_date(start_date, end_date, stock_code).reset_index()
        # institutional & foreign net trading totals
        investor_df = stock.get_market_trading_volume_by_date(start_date, end_date, stock_code)[["기관합계", "외국인합계"]].reset_index()
        # KOSPI closing price
        kospi_df = stock.get_index_ohlcv_by_date(start_date, end_date, "1001")[["종가"]].reset_index()

        # rename columns
        stock_df.columns = ["Date", "Open", "High", "Low", "Close", "Volume"]
        investor_df.columns = ["Date", "inst", "fore"]
        kospi_df.columns = ["Date", "kospi"]
        business_days = business_days[business_days["Date"] >= stock_df["Date"][0]]

        # left-join the price data onto the business-day calendar
        train_x = pd.merge(business_days, stock_df, how='left', on="Date")
        train_x = pd.merge(train_x, investor_df, how='left', on="Date")
        train_x = pd.merge(train_x, kospi_df, how='left', on="Date")

        # forward-fill NaNs from the previous day
        train_x.iloc[:, 1:] = train_x.iloc[:, 1:].ffill(axis=0)
        # the first day may be NaN, so backward-fill as well
        train_x.iloc[:, 1:] = train_x.iloc[:, 1:].bfill(axis=0)

    except:
        # fallback for stocks without institutional & foreign trading totals
        stock_dic[stock_name] = dict.fromkeys(["df", "target_list"])
        stock_df = stock.get_market_ohlcv_by_date(start_date, end_date, stock_code).reset_index()
        kospi_df = stock.get_index_ohlcv_by_date(start_date, end_date, "1001")[["종가"]].reset_index()

        stock_df.columns = ["Date", "Open", "High", "Low", "Close", "Volume"]
        kospi_df.columns = ["Date", "kospi"]
        business_days = business_days[business_days["Date"] >= stock_df["Date"][0]]

        # left-join the price data onto the business-day calendar
        train_x = pd.merge(business_days, stock_df, how='left', on="Date")
        train_x = pd.merge(train_x, kospi_df, how='left', on="Date")

        # forward-fill NaNs from the previous day
        train_x.iloc[:, 1:] = train_x.iloc[:, 1:].ffill(axis=0)
        # the first day may be NaN, so backward-fill as well
        train_x.iloc[:, 1:] = train_x.iloc[:, 1:].bfill(axis=0)
    stock_dic[stock_name]["df"] = train_x.copy()

6

# load the previously pickled dataset dict (saved earlier with op="w")
stock_dic = easyIO(None, folder_path + "dataset/stock_df_ori_" + start_date + "_" + end_date + ".pickle", op="r")
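
easyIO() is another DA_v5 helper. Judging from its call sites (easyIO(obj, path, op="w") to save, easyIO(None, path, op="r") to load), a plausible minimal reimplementation would be the following sketch:

# Hypothetical reimplementation of easyIO(), inferred from how it is called in this post.
import pickle

def easyIO(obj, path, op="r"):
    if op == "w":
        with open(path, "wb") as f:
            pickle.dump(obj, f)    # save obj to path
    elif op == "r":
        with open(path, "rb") as f:
            return pickle.load(f)  # load and return the pickled object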

7

start_date_yf = '2019-01-01'
end_date_yf = '2021-11-26'

stock_dic = easyIO(None, folder_path + "dataset/stock_df_ori_" + start_date + "_" + end_date + ".pickle", op="r")

# download foreign FX rates and global index data
forex_index_data = yf.download(["USDKRW=X", "USDAUD=X", "USDJPY=X", "EURUSD=X", "CNY=X", "^GSPC", "^DJI", "^IXIC", "^STOXX50E",
                                "^SOX",  "000001.SS", "000300.SS", "MME=F", "^TNX"], start=start_date_yf, end=end_date_yf, rounding=True)

tmp_forex_index = forex_index_data["Close"]
tmp_forex_index.index = pd.to_datetime(tmp_forex_index.index)
tmp_forex_index = tmp_forex_index[(tmp_forex_index.index >= pd.to_datetime(start_date)) & (tmp_forex_index.index <= pd.to_datetime(end_date))]
tmp_forex_index.columns = ["sse_composite_index", "csi300_index", "usdtocny", "eurtousd", "msci_emerging", "usdtoaud", "usdtojpy", "usdtokrw",
                           "dow", "snp500", "nasdaq", "semicon_index", "euro50", "us10y_tsy"]

tmp_forex_index.reset_index(drop=False, inplace=True)
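
Caution: the positional rename above works only because yfinance returns the tickers in lexicographic column order. A safer variant (a sketch, not the team's code) renames by an explicit ticker-to-name mapping so the result does not depend on column order:

# Safer alternative (sketch): rename columns by explicit mapping instead of position.
ticker_to_name = {
    "000001.SS": "sse_composite_index", "000300.SS": "csi300_index",
    "CNY=X": "usdtocny", "EURUSD=X": "eurtousd", "MME=F": "msci_emerging",
    "USDAUD=X": "usdtoaud", "USDJPY=X": "usdtojpy", "USDKRW=X": "usdtokrw",
    "^DJI": "dow", "^GSPC": "snp500", "^IXIC": "nasdaq",
    "^SOX": "semicon_index", "^STOXX50E": "euro50", "^TNX": "us10y_tsy",
}
tmp_forex_index = tmp_forex_index.rename(columns=ticker_to_name)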

2.3. Feature Engineering

8

# ===== feature engineering =====
non_stock = []
corr_list = []
timeunit_gap_forviz = 1
metric_days = 14
cat_vars = []
bin_vars = []
cat_vars.append("weekday")
cat_vars.append("weeknum")
bin_vars.append("mfi_signal")
num_pvalue_check = None
cat_pvalue_check = series(0, index=["weekday", "weeknum", "mfi_signal"])


for stock_name, stock_data in stock_dic.items():
    train_x = stock_data["df"].copy()

    # 1. add FX-rate and global index features
    train_x = pd.merge(train_x, tmp_forex_index, how="left", on="Date")
    train_x = train_x.ffill() # forward fill from previous days
    train_x = train_x.bfill() # the first day may be NaN, so backward fill


    # 2. add weekday and week-of-month features
    train_x['weekday'] = train_x["Date"].apply(lambda x: x.weekday())
    train_x['weeknum'] = train_x["Date"].apply(lambda x: week_of_month(x))


    # 3. add trading-amount feature (close * volume)
    train_x['trading_amount'] = train_x["Close"] * train_x["Volume"]

    # 4. add sin/cos features to capture monthly cyclicality
    day_to_sec = 24 * 60 * 60
    month_to_sec = 20 * day_to_sec  # ~20 trading days per month
    timestamp_s = train_x["Date"].apply(datetime.timestamp)
    timestamp_freq = round((timestamp_s / month_to_sec).diff(20)[20], 1)
    train_x['dayofmonth_freq_sin'] = np.sin((timestamp_s / month_to_sec) * ((2 * np.pi) / timestamp_freq))
    train_x['dayofmonth_freq_cos'] = np.cos((timestamp_s / month_to_sec) * ((2 * np.pi) / timestamp_freq))


    # 5. add OBV (on-balance volume) feature
    # buy signal: obv > obv_ema
    # sell signal: obv < obv_ema
    obv = [0]
    for i in range(1, len(train_x.Close)):
        if train_x.Close[i] >= train_x.Close[i - 1]:
            obv.append(obv[-1] + train_x.Volume[i])
        elif train_x.Close[i] < train_x.Close[i - 1]:
            obv.append(obv[-1] - train_x.Volume[i])
    train_x['obv'] = obv
    train_x['obv'][0] = nan
    train_x['obv_ema'] = train_x['obv'].ewm(com=metric_days, min_periods=metric_days).mean()


    # 6. add stochastic oscillator features
    # fast_d = moving average of fast_k
    train_x[["fast_k", "fast_d"]] = stochastic(train_x, n=metric_days)[["fast_k", "fast_d"]]


    # 7. add MFI (Money Flow Index) feature
    # MFI = 100 - (100 / (1 + MFR))
    # MFR = 14-day positive MF / 14-day negative MF
    # MF = volume * (high + low + close) / 3
    # build the MF column
    train_x["mf"] = train_x["Volume"] * ((train_x["High"]+train_x["Low"]+train_x["Close"]) / 3)
    # mark each day's MF as positive ('p') or negative ('n')
    p_n = []
    for i in range(len(train_x['mf'])):
        if i == 0 :
            p_n.append(nan)
        else:
            if train_x['mf'][i] >= train_x['mf'][i-1]:
                p_n.append('p')
            else:
                p_n.append('n')
    train_x['p_n'] = p_n
    # compute the ratio of 14-day positive MF to 14-day negative MF
    mfr = []
    for i in range(len(train_x['mf'])):
        if i < metric_days-1:
            mfr.append(nan)
        else:
            train_x_=train_x.iloc[(i-metric_days+1):i]
            a = (sum(train_x_['mf'][train_x['p_n'] == 'p']) + 1) / (sum(train_x_['mf'][train_x['p_n'] == 'n']) + 10)  # small constants guard against division by zero
            mfr.append(a)
    train_x['mfr'] = mfr
    # final MFI column
    train_x['mfi'] = 100 - (100 / (1 + train_x['mfr']))
    train_x["mfi_signal"] = train_x['mfi'].apply(lambda x: "buy" if x > 50 else "sell")


    # 8. add moving averages
    train_x["close_mv5"] = train_x["Close"].rolling(5, min_periods=5).mean()
    train_x["close_mv10"] = train_x["Close"].rolling(10, min_periods=10).mean()
    train_x["close_mv20"] = train_x["Close"].rolling(20, min_periods=20).mean()

    train_x["volume_mv5"] = train_x["Volume"].rolling(5, min_periods=5).mean()
    train_x["volume_mv10"] = train_x["Volume"].rolling(10, min_periods=10).mean()
    train_x["volume_mv20"] = train_x["Volume"].rolling(20, min_periods=20).mean()

    train_x["trading_amount_mv5"] = train_x["trading_amount"].rolling(5, min_periods=5).mean()
    train_x["trading_amount_mv10"] = train_x["trading_amount"].rolling(10, min_periods=10).mean()
    train_x["trading_amount_mv20"] = train_x["trading_amount"].rolling(20, min_periods=20).mean()

    train_x["kospi_mv5"] = train_x["kospi"].rolling(5, min_periods=5).mean()
    train_x["kospi_mv10"] = train_x["kospi"].rolling(10, min_periods=10).mean()
    train_x["kospi_mv20"] = train_x["kospi"].rolling(20, min_periods=20).mean()

    try:
        train_x["inst_mv5"] = train_x["inst"].rolling(5, min_periods=5).mean()
        train_x["inst_mv10"] = train_x["inst"].rolling(10, min_periods=10).mean()
        train_x["inst_mv20"] = train_x["inst"].rolling(20, min_periods=20).mean()

        # feature: consecutive days of institutional net buying
        cnt_consecutive = 0
        tmp_consecutive = []
        for i in train_x["inst"]:
            if i > 0:
                cnt_consecutive += 1
            else:
                cnt_consecutive = 0
            tmp_consecutive.append(cnt_consecutive)
        train_x["consec_inst"] = tmp_consecutive

        train_x["fore_mv5"] = train_x["fore"].rolling(5, min_periods=5).mean()
        train_x["fore_mv10"] = train_x["fore"].rolling(10, min_periods=10).mean()
        train_x["fore_mv20"] = train_x["fore"].rolling(20, min_periods=20).mean()

        # feature: consecutive days of foreign net buying
        cnt_consecutive = 0
        tmp_consecutive = []
        for i in train_x["fore"]:
            if i > 0:
                cnt_consecutive += 1
            else:
                cnt_consecutive = 0
            tmp_consecutive.append(cnt_consecutive)
        train_x["consec_fore"] = tmp_consecutive
    except:
        pass


    # 9. add lagged close prices
    tmp_df = dataframe()
    tmp_cols = []
    # closes from 1 to 5 days ago
    for i in range(1,6,1):
        tmp_df = pd.concat([tmp_df, train_x["Close"].shift(i).to_frame()], axis=1)
        tmp_cols.append("close_" + str(i) + "shift")
    tmp_df.columns = tmp_cols
    train_x = pd.concat([train_x, tmp_df], axis=1)


    # 10. tidy up columns
    # drop the intermediate columns used only to build the indicators
    train_x.drop(["mf", "p_n", "mfr"], inplace=True, axis=1)
    # lower-case and sort the column names
    train_x.columns = train_x.columns.str.lower()
    train_x = pd.concat([train_x[["date"]], train_x.iloc[:,1:].sort_index(axis=1)], axis=1)


    # 11. create target list
    target_list = []
    target_list.append(train_x["close"].copy())
    target_list.append(train_x["close"].shift(-1))
    target_list.append(train_x["close"].shift(-2))
    target_list.append(train_x["close"].shift(-3))
    target_list.append(train_x["close"].shift(-4))
    target_list.append(train_x["close"].shift(-5))
    for idx, value in enumerate(target_list):
        value.name = "target_shift" + str(idx)


    # 12. one-hot encoding & save df
    onehot_encoder = MyOneHotEncoder()
    train_x = onehot_encoder.fit_transform(train_x, cat_vars + bin_vars)
    stock_dic[stock_name]["df"] = train_x.copy()
    stock_dic[stock_name]["target_list"] = target_list


easyIO(stock_dic, folder_path + "dataset/stock_df_fe_" + start_date + "_" + end_date + ".pickle", op="w")
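
The loop above also relies on three DA_v5 helpers that are not shown: week_of_month, stochastic, and MyOneHotEncoder. Below are minimal sketches consistent with how they are called here; the names and call signatures match the code above, but the internals are assumptions.

# Hypothetical sketches of the DA_v5 helpers used above.
import math
import pandas as pd

def week_of_month(dt):
    # week-of-month, counting from the week containing the 1st of the month
    first_day = dt.replace(day=1)
    return int(math.ceil((dt.day + first_day.weekday()) / 7.0))

def stochastic(df, n=14):
    # fast stochastic oscillator: %K over an n-day high/low window, %D = 3-day MA of %K
    low_n = df["Low"].rolling(n, min_periods=n).min()
    high_n = df["High"].rolling(n, min_periods=n).max()
    out = pd.DataFrame(index=df.index)
    out["fast_k"] = 100 * (df["Close"] - low_n) / (high_n - low_n)
    out["fast_d"] = out["fast_k"].rolling(3).mean()
    return out

class MyOneHotEncoder:
    # one-hot encode the given columns into '<col>_<level>' dummies, as seen above
    def fit_transform(self, df, cols):
        dummies = pd.get_dummies(df[cols].astype("category"), prefix=cols)
        return pd.concat([df.drop(columns=cols), dummies], axis=1)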

9

# ===== feature selection and feature scaling =====

# 1. Names of the dummy variable columns
cat_vars_oh = ["weekday_0", "weekday_1", "weekday_2", "weekday_3", "weekday_4",
               "weeknum_1", "weeknum_2", "weeknum_3", "weeknum_4", "weeknum_5",
               'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
               'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']
bin_vars_oh = ["mfi_signal_buy", "mfi_signal_sell"]
forex_index_vars = ["sse_composite_index", "csi300_index", "usdtocny", "eurtousd", "msci_emerging",
                    "usdtoaud", "usdtojpy", "usdtokrw", "dow", "snp500", "nasdaq", "semicon_index", "euro50", "us10y_tsy"]
month_var = ['month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']


# 2. dict to store the candidate feature sets
feature_set_dic = dict.fromkeys(range(1,50))
for i in feature_set_dic.keys(): feature_set_dic[i] = {}


# # All features
# feature_seed = 1
# feature_set_dic[feature_seed]["selected_features"] = ['date', 'close', 'close_1shift', 'close_2shift', 'close_3shift', 'close_4shift', 'close_5shift', 'close_mv10', 'close_mv20', 'close_mv5',
#                      'consec_fore', 'consec_inst', 'csi300_index', 'dayofmonth_freq_cos', 'dayofmonth_freq_sin', 'dow', 'euro50', 'eurtousd', 'fast_d', 'fast_k',
#                      'fore', 'fore_mv10', 'fore_mv20', 'fore_mv5', 'inst', 'inst_mv10', 'inst_mv20', 'inst_mv5', 'kospi', 'kospi_mv10', 'kospi_mv20', 'kospi_mv5',
#                      'mfi', 'mfi_signal_buy', 'mfi_signal_sell', 'msci_emerging', 'nasdaq', 'obv', 'obv_ema', 'semicon_index', 'snp500', 'sse_composite_index',
#                      'trading_amount', 'trading_amount_mv10', 'trading_amount_mv20', 'trading_amount_mv5', 'us10y_tsy', 'usdtoaud', 'usdtocny', 'usdtojpy', 'usdtokrw',
#                      'volume', 'volume_mv10', 'volume_mv20', 'volume_mv5', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
#                      'weeknum_1', 'weeknum_2', 'weeknum_3', 'weeknum_4', 'weeknum_5']
# feature_set_dic[feature_seed]["logtrans_vec"] = []

10

# 2. load feature seed 7, the best-performing of the 49 candidate feature combinations

feature_seed = 7
# feature_set_dic[7]["selected_features"] = ["date", "close"] + ["weekday_0", "weekday_1", "weekday_2", "weekday_3", "weekday_4"]

feature_set_dic = easyIO(None, folder_path + "dataset/feature_dic.pkl", op="r")
selected_features = feature_set_dic[feature_seed]["selected_features"]
logtrans_vec = feature_set_dic[feature_seed]["logtrans_vec"]

feature_name = "feature_seed_" + str(feature_seed)
createFolder(folder_path + "result/" + feature_name + "/")
createFolder(folder_path + "submission/" + feature_name + "/")

# date cutoff config
date_cutoff = None
# anomaly cutoff config
anomaly_cutoff = None
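
createFolder, used just above, is presumably a thin wrapper around os.makedirs; a sketch under that assumption:

# Hypothetical createFolder(): create the directory if it does not exist yet.
import os

def createFolder(path):
    os.makedirs(path, exist_ok=True)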

11

# 3. price-derived features & scaling

for stock_name, stock_data in tqdm(stock_dic.items()):
    train_x = stock_data["df"].copy()

    train_x["month"] = train_x["date"].dt.month
    oh_encoder = MyOneHotEncoder()
    train_x = oh_encoder.fit_transform(train_x, ["month"])

    train_x["close_return"] = train_x["close"].pct_change()
    train_x["std20"] = train_x["close"].rolling(20).std()

    train_x["close_mean_2"] = (train_x["high"] + train_x["low"]) / 2
    train_x["close_mean_3"] = (train_x["high"] + train_x["low"] + train_x["close"]) / 3
    train_x["close_diff"] = train_x["close_mean_3"] - train_x["close_mean_2"]
    train_x["high_low_range"] = train_x["high"] - train_x["low"]
    train_x["high_low_range_return"] = train_x["high_low_range"].pct_change()

    train_x["high_to_close"] = train_x["high"] - train_x["close"]
    train_x["close_to_low"] = train_x["close"] - train_x["low"]

    train_x["snp500_return"] = train_x["snp500"].pct_change()
    train_x["dow_return"] = train_x["dow"].pct_change()
    train_x["nasdaq_return"] = train_x["nasdaq"].pct_change()
    train_x["semicon_index_return"] = train_x["semicon_index"].pct_change()

    train_x["sse_composite_index_return"] = train_x["sse_composite_index"].pct_change()
    train_x["usdtokrw_return"] = train_x["usdtokrw"].pct_change()
    train_x["eurtousd_return"] = train_x["eurtousd"].pct_change()
    train_x["us10y_tsy_return"] = train_x["us10y_tsy"].pct_change()

    train_x.replace(np.inf, 0, inplace=True)
    train_x.replace(-np.inf, 0, inplace=True)

    # 3.1 smoothing on target
    train_x["close_smoothing"] = 1
    if anomaly_cutoff is not None:
        tmp_list = []
        for idx, value in enumerate(train_x["close_return"]):
            if isna(value):
                tmp_list.append(train_x["close"].iloc[idx])
                continue
            if np.abs(value) > anomaly_cutoff:
                print("anomaly detected :", stock_name)
                if value >= 0:
                    tmp_list.append(train_x["close"].iloc[idx - 1] * (1 + anomaly_cutoff))
                else:
                    tmp_list.append(train_x["close"].iloc[idx - 1] * (1 - anomaly_cutoff))
            else:
                tmp_list.append(train_x["close"].iloc[idx])
        train_x["close_smoothing"] = series(tmp_list)
    train_x.ffill(inplace=True)

    # 3.2 feature selection
    if selected_features is not None:
        tmp_list = [i for i in selected_features if i in train_x.columns] + ["close_smoothing"]
        if len(tmp_list) > 1:
            train_x = train_x[tmp_list]

    if date_cutoff is not None:
        train_x = train_x[train_x["date"] >= date_cutoff]

    train_x = train_x.dropna()
    train_x.reset_index(drop=True, inplace=True)
    tmp_target = train_x["close_smoothing"] if anomaly_cutoff is not None else train_x["close"]
    train_x.drop("close_smoothing", axis=1, inplace=True)


    # 3.3 add lagged close prices
    tmp_df = dataframe()
    tmp_cols = []
    for i in range(1, 6, 1):
        tmp_df = pd.concat([tmp_df, stock_data["df"]["close"].shift(i).to_frame()], axis=1)
        tmp_cols.append("close_" + str(i) + "shift")
    tmp_df.columns = tmp_cols
    stock_data["df"] = pd.concat([stock_data["df"], tmp_df], axis=1)


    # 3.4 create target list
    target_list = []
    target_list.append(tmp_target.copy())
    target_list.append(tmp_target.shift(-1))
    target_list.append(tmp_target.shift(-2))
    target_list.append(tmp_target.shift(-3))
    target_list.append(tmp_target.shift(-4))
    target_list.append(tmp_target.shift(-5))
    for idx, value in enumerate(target_list):
        value.name = "target_shift" + str(idx)

    for i in train_x.columns[1:]:
        train_x[i] = train_x[i].astype("float32")
        if len(np.where(np.isinf(train_x[i]))[0]) > 0:
            train_x[i][np.where(np.isinf(train_x[i]))[0]] = nan
            train_x[i].ffill(inplace=True)


    # 3.5 feature scaling
    # log transform
    for i in logtrans_vec:
        if i in train_x.columns:
            train_x[i] = train_x[i].apply(np.log1p)

    stock_dic[stock_name]["df"] = train_x.copy()
    stock_dic[stock_name]["target_list"] = target_list
    del train_x

2.4. Model Fitting & Prediction

12

# initialize variables needed before training
kfolds_spliter = TimeSeriesSplit(n_splits=5, test_size=1, gap=0)

targetType = "numeric"
targetTask = None
class_levels = [0, 1]
cut_off = 0

ds = None
result_val = None
result_test = None
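
TimeSeriesSplit(n_splits=5, test_size=1) produces five expanding-window splits, each validating on a single future point, so no fold ever trains on data after its validation date. (It is declared here; the Linear pipeline below actually validates on a simple last-5-days holdout.) A toy illustration:

# Toy illustration of the expanding-window splits TimeSeriesSplit produces.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

toy = np.arange(10)
for tr_idx, va_idx in TimeSeriesSplit(n_splits=5, test_size=1, gap=0).split(toy):
    print("train:", tr_idx, "-> val:", va_idx)
# train: [0 1 2 3 4] -> val: [5]
#   ...
# train: [0 1 2 3 4 5 6 7 8] -> val: [9]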

13

# Model evaluation showed that linear regression has the best predictive performance, so only "Linear" is used below.
# model_names = ["Linear", "KNN", "MLP_Desc_V2", "MLP_ResNet_V1"]
model_names = ["Linear"]


fit_runningtime = time()

# 1. containers for per-stock predictions and performance
total_perf = None
for stock_name, stock_data in stock_dic.items():
    stock_data["perf_list"] = dict.fromkeys(model_names)
    stock_data["pred_list"] = dict.fromkeys(model_names)
    stock_data["pred_list"]["best_pred"] = dict.fromkeys([1, 2, 3, 4, 5], [0])
    total_perf = dict.fromkeys(model_names)
    for i in model_names:
        stock_data["perf_list"][i] = dict.fromkeys([1, 2, 3, 4, 5], [0])
        stock_data["pred_list"][i] = dict.fromkeys([1, 2, 3, 4, 5], [0])
        total_perf[i] = dict.fromkeys([1, 2, 3, 4, 5], [0])
        for j in total_perf[i].keys():
            total_perf[i][j] = series(0, index=["MAE", "MAPE", "NMAE", "RMSE", "NRMSE", "R2", "Running_Time"])

target_timegap = 5
val_size = 5
seqLength = 5
val_year = 2021; val_month = 11; val_day = 19
test_year = 2021; test_month = 11; test_day = 26

14

import pandas_datareader as pdr

# 2. Model fit & predict
for time_ngap in range(1,target_timegap+1):
    print(F"=== Target on N+{time_ngap} ===")
    # time_ngap = 1

    # USD/JPY adjustment - weak vs. strong safe-haven sentiment flags sharp drops or rallies
    start_date_yf = "/".join([str(val_month), str(val_day - 5), str(val_year)])
    end_date_yf = "/".join([str(val_month), str(val_day), str(val_year)])
    tmp_usdjpy = pdr.DataReader("DEXJPUS", "fred", start=start_date_yf, end=end_date_yf)
    tmp_usdjpy.pct_change()
    val_adj_usdjpy = round(tmp_usdjpy.ffill().pct_change().iloc[-1], 2)[0]

    start_date_yf = "/".join([str(test_month), str(test_day - 5), str(test_year)])
    end_date_yf = "/".join([str(test_month) ,str(test_day), str(test_year)])
    tmp_usdjpy = pdr.DataReader("DEXJPUS", "fred", start=start_date_yf, end=end_date_yf)
    test_adj_usdjpy = round(tmp_usdjpy.ffill().pct_change().iloc[-1], 2)[0]

    for stock_name, stock_data in tqdm(stock_dic.items()):

        tmp_x = stock_data["df"].copy()
        tmp_y = copy.deepcopy(stock_data["target_list"][time_ngap])
        tmp_date = tmp_x["date"]
        arima_target = stock_data["target_list"][0]
        arima_date = stock_data["df"]["date"]
        tmp_x.drop("date", axis=1, inplace=True)

        # <Linear Regression>
        if "Linear" in model_names:
            if time_ngap in [1, 2, 3, 4, 5]:
                tmp_runtime = time()
                print("Linear Regression on", stock_name, "\n")
                # evaluation on the validation set
                numericCols = [i for i in tmp_x.columns if i not in cat_vars_oh + bin_vars_oh]
                scaler_feature = StandardScaler()
                scaler_target = StandardScaler()

                train_x = tmp_x[tmp_date <= datetime(val_year, val_month, val_day)][:-val_size][:-time_ngap]
                train_y = tmp_y[tmp_date <= datetime(val_year, val_month, val_day)][:-val_size][:-time_ngap]

                tmp_anomaly = (train_y - train_x["close"]) / train_x["close"]
                tmp_new_target = []
                anomaly_cutoff = 0.1 * (1 + 0.2 * time_ngap)
                for idx, value in enumerate(tmp_anomaly):
                    if np.abs(value) > anomaly_cutoff:
                        if value >= 0:
                            tmp_new_target.append(train_x["close"].iloc[idx] * (1 + anomaly_cutoff))
                        else:
                            tmp_new_target.append(train_x["close"].iloc[idx] * (1 - anomaly_cutoff))
                    else:
                        tmp_new_target.append(train_y.iloc[idx])
                train_y = tmp_new_target

                val_x = tmp_x[tmp_date <= datetime(val_year, val_month, val_day)][-val_size:]
                val_y = tmp_y[tmp_date <= datetime(val_year, val_month, val_day)][-val_size:]

                # scaling
                # scaler_target.fit(train_x["close"].to_frame())
                # train_y = series(scaler_target.transform(train_y.to_frame())[:, 0])
                train_x[numericCols] = dataframe(scaler_feature.fit_transform(train_x[numericCols]), index=train_x.index, columns=numericCols)
                val_x[numericCols] = dataframe(scaler_feature.transform(val_x[numericCols]), index=val_x.index, columns=numericCols)

                model = lm.LinearRegression()
                model.fit(train_x, train_y)

                # # no adjustment
                # tmp_pred = model.predict(val_x)[:, np.newaxis].flatten()

                # adjustment
                if abs(val_adj_usdjpy) != 0:
                    tmp_pred = model.predict(val_x)[:, np.newaxis].flatten() * (1 + val_adj_usdjpy)
                else:
                    tmp_pred = model.predict(val_x)[:, np.newaxis].flatten()

                mae = metrics.mean_absolute_error(val_y, tmp_pred)
                rmse = metrics.mean_squared_error(val_y, tmp_pred, squared=False)
                stock_data["perf_list"]["Linear"][time_ngap] = {"MAE": mae,
                                                                "MAPE": metrics.mean_absolute_percentage_error(val_y, tmp_pred),
                                                                "NMAE": (mae / val_y.abs().mean()),
                                                                "RMSE": rmse,
                                                                "NRMSE": (rmse / val_y.abs().mean()),
                                                                "R2": metrics.r2_score(val_y, tmp_pred)}
                tmp_perf = series(stock_data["perf_list"]["Linear"][time_ngap])
                print(tmp_perf)

                # prediction on test set
                scaler_feature = StandardScaler()
                scaler_target = StandardScaler()

                full_x = tmp_x[tmp_date <= datetime(test_year, test_month, test_day)][:-time_ngap]
                full_y = tmp_y[tmp_date <= datetime(test_year, test_month, test_day)][:-time_ngap]

                tmp_anomaly = (full_y - full_x["close"]) / full_x["close"]
                tmp_new_target = []
                anomaly_cutoff = 0.1 * (1 + 0.2 * time_ngap)
                for idx, value in enumerate(tmp_anomaly):
                    if np.abs(value) > anomaly_cutoff:
                        if value >= 0:
                            tmp_new_target.append(full_x["close"].iloc[idx] * (1 + anomaly_cutoff))
                        else:
                            tmp_new_target.append(full_x["close"].iloc[idx] * (1 - anomaly_cutoff))
                    else:
                        tmp_new_target.append(full_y.iloc[idx])
                full_y = tmp_new_target

                test_x = tmp_x[tmp_date == datetime(test_year, test_month, test_day)]

                # scaling
                # scaler_target.fit(full_x["close"].to_frame())
                # full_y = series(scaler_target.transform(full_y.to_frame())[:, 0])
                full_x[numericCols] = dataframe(scaler_feature.fit_transform(full_x[numericCols]), index=full_x.index, columns=numericCols)
                test_x[numericCols] = dataframe(scaler_feature.transform(test_x[numericCols]), index=test_x.index, columns=numericCols)

                model = lm.LinearRegression()
                model.fit(full_x, full_y)
                # stock_data["pred_list"]["Linear"][time_ngap] = scaler_target.inverse_transform(model.predict(test_x)[:, np.newaxis]).flatten()
                stock_data["pred_list"]["Linear"][time_ngap] = model.predict(test_x)[:, np.newaxis].flatten()
                tmp_runtime = time() - tmp_runtime
                total_perf["Linear"][time_ngap] += tmp_perf.append(series({"Running_Time": tmp_runtime}))
                print("\n\n")


    for i in model_names:
        total_perf[i][time_ngap] /= len(stock_dic.keys())
fit_runningtime = time() - fit_runningtime

# prediction value check
print(stock_dic["삼성전자"]["pred_list"])
print(fit_runningtime)

15

# 3. build the performance evaluation table
perf_table = dataframe(index=model_names, columns=["time_gap_" + str(i) for i in range(1,6)])
runningtime_table = dataframe(index=model_names, columns=["time_gap_" + str(i) for i in range(1,6)])
for i in list(total_perf.keys()):
    if array(list(total_perf[i].values())).sum() == 0:
        pass
    else:
        perf_table.loc[i] = dataframe(total_perf[i]).loc["NMAE"].values
        runningtime_table.loc[i] = dataframe(total_perf[i]).loc["Running_Time"].values

# NMAE is the competition metric (MAE normalized by the mean |target|, reported below as a percentage)
perf_table = perf_table.iloc[:,:target_timegap]
perf_table = perf_table * 100
perf_table.loc["best_model"] = perf_table.min(axis=0)
perf_table["avg"] = perf_table.iloc[:,:5].mean(axis=1)
perf_table["std"] = perf_table.iloc[:,:5].std(axis=1)
perf_table["running_time"] = runningtime_table.mean(axis=1).append(series({"best_model": -1}))
print(perf_table)
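
For reference, the NMAE entries above are computed exactly as in the perf_list code: MAE divided by the mean absolute target, then multiplied by 100 for the table. As a standalone function:

# NMAE as computed above: mean absolute error / mean absolute target.
import numpy as np

def nmae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.abs(y_true - y_pred).mean() / np.abs(y_true).mean()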

16

# 4. export submission file
submission = read_csv("projects/dacon_stockprediction/sample_submission.csv")
for for_model in model_names:
    for i in submission.columns[1:]:
        tmp_list = []
        for j in stock_dic[stock_list.index[stock_list == i][0]]["pred_list"][for_model].values():
            tmp_list.append(j[0])
        submission[i][:5] = tmp_list
    submission.to_csv(folder_path + "submission/" + feature_name + "/" + feature_name + "_" + for_model + ".csv", index=False)
