[시계열분석] 시계열 변수(빈도/추세/계절성/주기/시계열분해/더미변수/지연값) : ysyblog.tistory.com/179
[시계열분석] 시계열 변수 추출 실습(Python)(1) - 시계열 분해 (bike-sharing-demand dataset) : ysyblog.tistory.com/209
해당 포스팅은 위 포스팅들에 이어 진행됩니다.
이동평균(moving average) 계산
# Compare the raw hourly target with its daily (24h) and weekly (24*7h)
# moving averages in a single line plot (blue = raw, orange = daily,
# green = weekly).
ma_daily = raw_all[['count']].rolling(24).mean()       # daily pattern
ma_weekly = raw_all[['count']].rolling(24 * 7).mean()  # weekly pattern
comparison = pd.concat([raw_all[['count']], ma_daily, ma_weekly], axis=1)
comparison.plot(kind='line', figsize=(20,6), linewidth=3, fontsize=20,
                xlim=('2012-01-01', '2013-01-01'), ylim=(0,1000))
plt.title('Time Series of Target', fontsize=20)
plt.xlabel('Index', fontsize=15)
plt.ylabel('Demand', fontsize=15)
plt.show()
# Fill the leading NaNs produced by the rolling windows (the first
# window-1 rows have no complete window) and merge the smoothed series
# back into the frame.
# Fixed: fillna(method='ffill'/'bfill') is deprecated since pandas 2.1
# (removed in 3.0); use .ffill()/.bfill() instead.
Y_count_Day = raw_all[['count']].rolling(24).mean()
Y_count_Day = Y_count_Day.ffill().bfill()  # forward-fill, then back-fill the leading gap
Y_count_Day.columns = ['count_Day']
Y_count_Week = raw_all[['count']].rolling(24*7).mean()
Y_count_Week = Y_count_Week.ffill().bfill()
Y_count_Week.columns = ['count_Week']
raw_all = pd.concat([raw_all, Y_count_Day], axis=1)
raw_all = pd.concat([raw_all, Y_count_Week], axis=1)
raw_all
증감폭 계산
# Plot the first difference of Y for a specific period: diff() gives the
# step-to-step change, i.e. how much demand rose or fell each hour.
raw_all[['count']].diff().plot(kind='line', figsize=(20,6), linewidth=3, fontsize=20,
                               xlim=('2012-01-01', '2012-06-01'), ylim=(-1000,1000))
plt.title('Time Series of Target', fontsize=20)
plt.xlabel('Index', fontsize=15)
plt.ylabel('Demand', fontsize=15)
plt.show()
# Diff of Y and merging. The first row of diff() is NaN.
# Fixed: fillna(method=...) is deprecated since pandas 2.1; use .ffill()/.bfill().
Y_diff = raw_all[['count']].diff().ffill().bfill()
Y_diff.columns = ['count_diff']
raw_all = pd.concat([raw_all, Y_diff], axis=1)
raw_all
그룹화하기(등급나누기)
# Group rows by temperature using equal-width bins; the bin edges (and
# therefore each row's label) change with the number of bins.
raw_all[['temp']]
pd.cut(raw_all['temp'], 5)  # preview: five equal-width temperature bands
# Keep the 10-bin grouping as a feature column.
raw_all['temp_group'] = pd.cut(raw_all['temp'], 10)
raw_all
누적 분기 칼럼 만들기
# Extract year and quarter from the datetime column.
# NOTE: the 'Quater' spelling is kept because later code refers to it.
timestamp = raw_all.datetime.dt
raw_all['Year'] = timestamp.year
raw_all['Quater'] = timestamp.quarter
# Cumulative quarter: quarters counted continuously across years, so the
# plain quarter (which repeats every year) gains a per-year offset.
raw_all['Quater_ver2'] = raw_all['Quater'] + 4 * (raw_all.Year - raw_all.Year.min())
raw_all
- 그냥 분기 칼럼은, 연도 구분이 없음. -> 분기에 따른 해석을 보기 위함
- 하지만 누적분기칼럼은 연도 구분이 포함된 것임. -> 년도에 따른 분기별 해석을 보기 위함
분기 변수 더미변수화
# One-hot encode the quarter column.
pd.get_dummies(raw_all['Quater'])
pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy')  # prefix names the dummy columns
# drop_first=True keeps k-1 dummies (avoids collinearity with an intercept).
pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy', drop_first=True)
pd.concat([raw_all, pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy', drop_first=True)], axis=1)
# Fixed: the original assignment line was truncated — it was missing the
# closing '], axis=1)', which is a syntax error.
raw_all = pd.concat([raw_all,
                     pd.get_dummies(raw_all['Quater'], prefix='Quater_Dummy', drop_first=True)],
                    axis=1)
시간 정보 추출(datetime.dt 이용)
# Extract calendar components from the datetime column
# (tab-completion on .dt shows the available accessors).
clock = raw_all.datetime.dt
raw_all['Month'] = clock.month
raw_all['Day'] = clock.day
raw_all['Hour'] = clock.hour
raw_all['DayofWeek'] = clock.dayofweek
raw_all
지연값 추출
raw_all['count'].shift(1)   # 1-row (1-hour) lag — use positive shift when dates ascend
raw_all['count'].shift(-2)  # value 2 rows ahead — use negative shift when dates descend
# Calculation of lags of Y.
raw_all['count_lag1'] = raw_all['count'].shift(1)
raw_all['count_lag2'] = raw_all['count'].shift(2)
# Back-fill the NaNs the shifts create at the start of the series.
# Fixed: fillna(method='bfill', inplace=True) on a selected column is
# deprecated (pandas >= 2.1) and may not write through; reassign instead.
raw_all['count_lag1'] = raw_all['count_lag1'].bfill()
raw_all['count_lag2'] = raw_all['count_lag2'].bfill()
raw_all
코드정리
### Feature engineering of default
def non_feature_engineering(raw):
    """Baseline preprocessing with no feature engineering.

    Parses the 'datetime' column, promotes a parsed 'DateTime' copy to the
    index when the frame still has a default integer index, and reindexes
    to a regular hourly grid, forward-filling any gaps.

    Parameters: raw -- DataFrame with a 'datetime' column (string or datetime).
    Returns: a new hourly-frequency DataFrame; the input is not modified.
    """
    frame = raw.copy()
    if 'datetime' in frame.columns:
        parsed = pd.to_datetime(frame['datetime'])
        frame['datetime'] = parsed
        frame['DateTime'] = pd.to_datetime(parsed)
    if frame.index.dtype == 'int64':
        frame.set_index('DateTime', inplace=True)
    # Hourly frequency only; gaps filled from the previous observation.
    return frame.asfreq('H', method='ffill')
# raw_rd = non_feature_engineering(raw_all)
### Feature engineering of all
def feature_engineering(raw):
    """Full feature-engineering pipeline for the hourly bike-sharing data.

    Steps, in order: parse/set the hourly DateTime index, decompose the
    target into trend/seasonal components, add daily/weekly rolling means,
    the first difference, a binned temperature group, calendar fields,
    1- and 2-hour lags, and quarter dummy variables.

    NOTE(review): indentation was reconstructed from an unindented paste;
    the nesting below follows the original blog source — verify against it.
    """
    raw_fe = raw.copy()
    # Parse the datetime column and keep a parsed copy to use as the index.
    if 'datetime' in raw_fe.columns:
        raw_fe['datetime'] = pd.to_datetime(raw_fe['datetime'])
        raw_fe['DateTime'] = pd.to_datetime(raw_fe['datetime'])
    # Promote DateTime to the index only when the default integer index
    # is still in place (i.e. the function hasn't been applied yet).
    if raw_fe.index.dtype == 'int64':
        raw_fe.set_index('DateTime', inplace=True)
    # Regular hourly frequency; gaps forward-filled.
    raw_fe = raw_fe.asfreq('H', method='ffill')
    # Additive seasonal decomposition of the target series.
    result = sm.tsa.seasonal_decompose(raw_fe['count'], model='additive')
    Y_trend = pd.DataFrame(result.trend)
    Y_trend.fillna(method='ffill', inplace=True)  # trend has NaNs at both ends
    Y_trend.fillna(method='bfill', inplace=True)
    Y_trend.columns = ['count_trend']
    Y_seasonal = pd.DataFrame(result.seasonal)
    Y_seasonal.fillna(method='ffill', inplace=True)
    Y_seasonal.fillna(method='bfill', inplace=True)
    Y_seasonal.columns = ['count_seasonal']
    # Leftover notebook sanity check; the result is computed and discarded.
    pd.concat([raw_fe, Y_trend, Y_seasonal], axis=1).isnull().sum()
    # Guard against adding duplicate columns on repeated calls.
    if 'count_trend' not in raw_fe.columns:
        if 'count_seasonal' not in raw_fe.columns:
            raw_fe = pd.concat([raw_fe, Y_trend, Y_seasonal], axis=1)
    # Daily (24h) and weekly (24*7h) moving averages of the target; the
    # leading window NaNs are filled forward then backward.
    Y_count_Day = raw_fe[['count']].rolling(24).mean()
    Y_count_Day.fillna(method='ffill', inplace=True)
    Y_count_Day.fillna(method='bfill', inplace=True)
    Y_count_Day.columns = ['count_Day']
    Y_count_Week = raw_fe[['count']].rolling(24*7).mean()
    Y_count_Week.fillna(method='ffill', inplace=True)
    Y_count_Week.fillna(method='bfill', inplace=True)
    Y_count_Week.columns = ['count_Week']
    if 'count_Day' not in raw_fe.columns:
        raw_fe = pd.concat([raw_fe, Y_count_Day], axis=1)
    if 'count_Week' not in raw_fe.columns:
        raw_fe = pd.concat([raw_fe, Y_count_Week], axis=1)
    # First difference of the target (hour-over-hour change).
    Y_diff = raw_fe[['count']].diff()
    Y_diff.fillna(method='ffill', inplace=True)
    Y_diff.fillna(method='bfill', inplace=True)
    Y_diff.columns = ['count_diff']
    if 'count_diff' not in raw_fe.columns:
        raw_fe = pd.concat([raw_fe, Y_diff], axis=1)
    # Equal-width temperature bins (10 groups).
    raw_fe['temp_group'] = pd.cut(raw_fe['temp'], 10)
    # Calendar features ('Quater' spelling kept — other code relies on it).
    raw_fe['Year'] = raw_fe.datetime.dt.year
    raw_fe['Quater'] = raw_fe.datetime.dt.quarter
    # Cumulative quarter: counts quarters continuously across years.
    raw_fe['Quater_ver2'] = raw_fe['Quater'] + (raw_fe.Year - raw_fe.Year.min()) * 4
    raw_fe['Month'] = raw_fe.datetime.dt.month
    raw_fe['Day'] = raw_fe.datetime.dt.day
    raw_fe['Hour'] = raw_fe.datetime.dt.hour
    raw_fe['DayofWeek'] = raw_fe.datetime.dt.dayofweek
    # One- and two-hour lags of the target; back-fill the leading NaNs.
    raw_fe['count_lag1'] = raw_fe['count'].shift(1)
    raw_fe['count_lag2'] = raw_fe['count'].shift(2)
    raw_fe['count_lag1'].fillna(method='bfill', inplace=True)
    raw_fe['count_lag2'].fillna(method='bfill', inplace=True)
    # Quarter dummies (k-1 columns via drop_first); the prefix check scans
    # existing column names so repeated calls don't duplicate the dummies.
    if 'Quater' in raw_fe.columns:
        if 'Quater_Dummy' not in ['_'.join(col.split('_')[:2]) for col in raw_fe.columns]:
            raw_fe = pd.concat([raw_fe, pd.get_dummies(raw_fe['Quater'], prefix='Quater_Dummy', drop_first=True)], axis=1)
            del raw_fe['Quater']
    return raw_fe
728x90
반응형