Data Analysis & ML/시계열분석

[시계열분석] 시계열 변수 추출 실습(Python)(2) - 이동평균/지연값/증감폭/그룹화 (bike-sharing-demand dataset)

YSY^ 2021. 3. 3. 15:45

[시계열분석] 시계열 변수(빈도/추세/계절성/주기/시계열분해/더미변수/지연값) : ysyblog.tistory.com/179

[시계열분석] 시계열 변수 추출 실습(Python)(1) - 시계열 분해 (bike-sharing-demand dataset) : ysyblog.tistory.com/209

해당 포스팅은 위 포스팅들에 이어 진행됩니다.

 

이동평균(moving average) 계산

 

# Compare the raw target with two moving-average windows on one chart.
count_frame = raw_all[['count']]                   # raw values per timestamp (blue line)
daily_mean = count_frame.rolling(24).mean()        # 24-row window to expose the daily pattern (orange line)
weekly_mean = count_frame.rolling(24 * 7).mean()   # 168-row window to expose the weekly pattern (green line)
ma_overlay = pd.concat([count_frame, daily_mean, weekly_mean], axis=1)
ma_overlay.plot(
    kind='line',
    figsize=(20, 6),
    linewidth=3,
    fontsize=20,
    xlim=('2012-01-01', '2013-01-01'),
    ylim=(0, 1000),
)
plt.title('Time Series of Target', fontsize=20)
plt.xlabel('Index', fontsize=15)
plt.ylabel('Demand', fontsize=15)
plt.show()

 

# Build the daily (24-row) and weekly (168-row) moving averages of `count`
# and append them to raw_all as `count_Day` / `count_Week`.
# rolling(k).mean() leaves the first k-1 rows as NaN, so forward-fill and then
# back-fill to remove every NaN. `.ffill()` / `.bfill()` replace the deprecated
# `fillna(method=...)` spelling.
Y_count_Day = raw_all[['count']].rolling(24).mean().ffill().bfill()
Y_count_Day.columns = ['count_Day']
Y_count_Week = raw_all[['count']].rolling(24 * 7).mean().ffill().bfill()
Y_count_Week.columns = ['count_Week']
raw_all = pd.concat([raw_all, Y_count_Day, Y_count_Week], axis=1)
raw_all

 

 

증감폭 계산

# Plot the row-to-row change of the target for a specific period.
# diff() returns how much each value rose or fell versus the previous row.
count_delta = raw_all[['count']].diff()
count_delta.plot(
    kind='line',
    figsize=(20, 6),
    linewidth=3,
    fontsize=20,
    xlim=('2012-01-01', '2012-06-01'),
    ylim=(-1000, 1000),
)
plt.title('Time Series of Target', fontsize=20)
plt.xlabel('Index', fontsize=15)
plt.ylabel('Demand', fontsize=15)
plt.show()

 

# First difference of `count`, appended to raw_all as `count_diff`.
# diff() leaves the first row as NaN; forward-fill then back-fill removes it.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` spelling.
Y_diff = raw_all[['count']].diff().ffill().bfill()
Y_diff.columns = ['count_diff']
raw_all = pd.concat([raw_all, Y_diff], axis=1)
raw_all

 

그룹화하기(등급나누기)

# Group rows by temperature.
raw_all.loc[:, ['temp']]
# Split the values into 5 equal-width bins (the bin edges change with the bin count).
pd.cut(raw_all['temp'], bins=5)

 

# Store the temperature bin of every row, using 10 equal-width bins.
raw_all['temp_group'] = pd.cut(raw_all['temp'], bins=10)
raw_all

 

누적 분기 칼럼 만들기

# Extract year and quarter from the timestamp column.
# (Tab-completing on `.dt` shows which accessors are available.)
raw_all['Year'] = raw_all['datetime'].dt.year
raw_all['Quater'] = raw_all['datetime'].dt.quarter
# Cumulative quarter: quarters keep counting up across years instead of
# restarting at 1 every January.
years_elapsed = raw_all['Year'] - raw_all['Year'].min()
raw_all['Quater_ver2'] = years_elapsed * 4 + raw_all['Quater']
raw_all

 

  1. 일반 분기 칼럼은 연도 구분이 없음. -> 연도와 무관한 분기별 효과를 해석하기 위함
  2. 반면 누적 분기 칼럼은 연도 구분이 포함된 것임. -> 연도에 따른 분기별 효과를 해석하기 위함

분기 변수 더미변수화

# Exploratory calls: each one only displays its result and changes nothing.
pd.get_dummies(raw_all['Quater'])
pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy')  # prefix: name prepended to each dummy column
pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy', drop_first=True)  # keep (number of categories - 1) dummies
pd.concat([raw_all, pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy', drop_first=True)], axis=1)  # preview of the merge

# Actual merge into raw_all. The original line never closed the list passed to
# pd.concat and omitted axis=1, so it was a syntax error.
quarter_dummies = pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy', drop_first=True)
raw_all = pd.concat([raw_all, quarter_dummies], axis=1)

 

시간 정보 추출(datetime.dt 이용)

# Derive calendar features from the `datetime` column through the .dt accessor.
stamp = raw_all['datetime'].dt
raw_all['Month'] = stamp.month
raw_all['Day'] = stamp.day
raw_all['Hour'] = stamp.hour
raw_all['DayofWeek'] = stamp.dayofweek
raw_all

지연값 추출

# Lag by one row (one hour): use a positive shift when the dates are in ascending order.
raw_all['count'].shift(periods=1)

# Value from two rows (two hours) ahead: use a negative shift when the dates are in descending order.
raw_all['count'].shift(periods=-2)

 

# Lagged copies of the target (1 and 2 rows back).
# shift(k) leaves the first k rows as NaN; back-fill them with the first
# available value. The result is assigned back explicitly: calling
# `raw_all[col].fillna(..., inplace=True)` acts on a column selection, which is
# deprecated and does not update raw_all when pandas Copy-on-Write is active.
raw_all['count_lag1'] = raw_all['count'].shift(1).bfill()
raw_all['count_lag2'] = raw_all['count'].shift(2).bfill()
raw_all

 

코드정리

### Feature engineering of default
def non_feature_engineering(raw):
    """Return a copy of *raw* placed on an hourly time axis, with no derived features.

    This is the baseline used when no feature engineering is applied.

    Steps:
      * if a ``datetime`` column exists, parse it and mirror it into ``DateTime``;
      * if the index has dtype ``int64``, make ``DateTime`` the index;
      * reindex to a regular hourly frequency, forward-filling rows for
        timestamps that were missing.

    Parameters
    ----------
    raw : pandas.DataFrame
        Input data. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        The hourly, forward-filled frame.

    Raises
    ------
    KeyError
        If the index is ``int64`` and there is no ``datetime`` column to build
        ``DateTime`` from.

    Example: ``raw_rd = non_feature_engineering(raw_all)``
    """
    raw_nfe = raw.copy()
    if 'datetime' in raw_nfe.columns:
        raw_nfe['datetime'] = pd.to_datetime(raw_nfe['datetime'])
        # Already parsed above, so reuse it instead of parsing a second time.
        raw_nfe['DateTime'] = raw_nfe['datetime']
    if raw_nfe.index.dtype == 'int64':
        raw_nfe.set_index('DateTime', inplace=True)
    # Only the time axis is set up here. An explicit Hour offset is used because
    # the 'H' string alias is deprecated in recent pandas releases.
    return raw_nfe.asfreq(pd.offsets.Hour(), method='ffill')


### Feature engineering of all
def _fill_gaps(frame):
    """Forward-fill, then back-fill, so neither interior nor leading NaNs remain."""
    return frame.ffill().bfill()


def feature_engineering(raw):
    """Return a copy of *raw* on an hourly time axis with all derived features added.

    Added columns (each derived series is skipped if a column of that name
    already exists):
      * ``count_trend`` / ``count_seasonal`` - additive seasonal decomposition
        of ``count``;
      * ``count_Day`` / ``count_Week`` - 24-row and 168-row moving averages;
      * ``count_diff`` - first difference of ``count``;
      * ``temp_group`` - ``temp`` cut into 10 equal-width bins;
      * ``Year``, ``Quater_ver2`` (cumulative quarter), ``Month``, ``Day``,
        ``Hour``, ``DayofWeek``;
      * ``count_lag1`` / ``count_lag2`` - ``count`` shifted by 1 and 2 rows;
      * ``Quater_Dummy_*`` - quarter dummies with the first level dropped.
        ``Quater`` itself is removed once the dummies are added.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain ``count`` and ``temp``. It needs either a ``datetime``
        column or an index that is already datetime-like. It is copied, never
        modified.

    Returns
    -------
    pandas.DataFrame
        The hourly frame with the feature columns appended.

    Notes
    -----
    ``sm`` is expected to be ``statsmodels.api``, imported at file level
    (the import is not visible in this section).
    """
    raw_fe = raw.copy()
    if 'datetime' in raw_fe.columns:
        raw_fe['datetime'] = pd.to_datetime(raw_fe['datetime'])
        raw_fe['DateTime'] = raw_fe['datetime']
    if raw_fe.index.dtype == 'int64':
        raw_fe.set_index('DateTime', inplace=True)
    # Explicit Hour offset: the 'H' string alias is deprecated in recent pandas.
    raw_fe = raw_fe.asfreq(pd.offsets.Hour(), method='ffill')

    target = raw_fe[['count']]

    # Trend and seasonal components; NaNs in the components are filled.
    result = sm.tsa.seasonal_decompose(raw_fe['count'], model='additive')
    Y_trend = _fill_gaps(pd.DataFrame(result.trend))
    Y_trend.columns = ['count_trend']
    Y_seasonal = _fill_gaps(pd.DataFrame(result.seasonal))
    Y_seasonal.columns = ['count_seasonal']

    # Moving averages; the first window-1 rows are NaN until filled.
    Y_count_Day = _fill_gaps(target.rolling(24).mean())
    Y_count_Day.columns = ['count_Day']
    Y_count_Week = _fill_gaps(target.rolling(24 * 7).mean())
    Y_count_Week.columns = ['count_Week']

    # First difference; the first row is NaN until filled.
    Y_diff = _fill_gaps(target.diff())
    Y_diff.columns = ['count_diff']

    # Append each derived series unless a column of that name is already there.
    # Every series is checked on its own, so an existing `count_trend` no longer
    # prevents a missing `count_seasonal` from being added.
    for derived in (Y_trend, Y_seasonal, Y_count_Day, Y_count_Week, Y_diff):
        if derived.columns[0] not in raw_fe.columns:
            raw_fe = pd.concat([raw_fe, derived], axis=1)

    raw_fe['temp_group'] = pd.cut(raw_fe['temp'], 10)

    # Calendar features come from the index, not from the `datetime` column:
    # asfreq(method='ffill') forward-fills that column, so rows inserted for
    # missing hours would otherwise inherit the previous row's timestamp.
    stamps = raw_fe.index
    raw_fe['Year'] = stamps.year
    raw_fe['Quater'] = stamps.quarter
    raw_fe['Quater_ver2'] = raw_fe['Quater'] + (raw_fe.Year - raw_fe.Year.min()) * 4
    raw_fe['Month'] = stamps.month
    raw_fe['Day'] = stamps.day
    raw_fe['Hour'] = stamps.hour
    raw_fe['DayofWeek'] = stamps.dayofweek

    # Lags are assigned back explicitly; in-place fillna on a column selection
    # does not update the frame when pandas Copy-on-Write is active.
    raw_fe['count_lag1'] = raw_fe['count'].shift(1).bfill()
    raw_fe['count_lag2'] = raw_fe['count'].shift(2).bfill()

    # Add quarter dummies once, then drop the raw quarter column they replace.
    has_dummies = 'Quater_Dummy' in ['_'.join(col.split('_')[:2]) for col in raw_fe.columns]
    if not has_dummies:
        quarter_dummies = pd.get_dummies(raw_fe['Quater'], prefix='Quater_Dummy', drop_first=True)
        raw_fe = pd.concat([raw_fe, quarter_dummies], axis=1)
        del raw_fe['Quater']
    return raw_fe
728x90
반응형