一、每日涨跌统计以及每日收益率的特征
所用数据为2010年4月19日至2017年2月7日的IF数据,共1652个交易日:其中888个交易日上涨(收盘价高于开盘价),其余764个交易日下跌或平盘。
IF单日最大涨幅为13.29个百分点(大奇迹日),单日最大跌幅为8.9个百分点(大盘跌停),涨跌幅均按开盘价到收盘价计算。
二、代码
# Daily up/down counts and open-to-close return distribution for the
# price data stored in if.csv (columns 'OpeningPx' and 'ClosingPx').
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Alternative data source (RiceQuant research API), kept for reference:
# df = rd.get_price('CSI300.INDX', '2005-01-01', '2015-07-25').reset_index()[['OpeningPx', 'ClosingPx']]
df = pd.read_csv('if.csv', index_col=0)

# True when the close is strictly above the open; flat days count as "down".
up_and_down = df['ClosingPx'] - df['OpeningPx'] > 0

# Open-to-close return of each day.
rate_of_return = (df['ClosingPx'] - df['OpeningPx']) / df['OpeningPx']

# Number of up (True) and down (False) days.
up_and_down_statistic = up_and_down.value_counts()
print(up_and_down_statistic)
print("\n")

# Optional xkcd-style bar chart of the up/down counts (disabled in the original):
# with plt.xkcd():
#     fig = plt.figure(figsize=(10, 8))
#     ax = fig.add_axes((0.1, 0.2, 0.8, 0.7))
#     ax.bar([-0.125, 1.0-0.125], [up_and_down_statistic[0], up_and_down_statistic[1]], 0.25)
#     ax.spines['right'].set_color('none')
#     ax.spines['top'].set_color('none')
#     ax.xaxis.set_ticks_position('bottom')
#     ax.set_xticks([0, 1])
#     ax.set_xlim([-0.5, 1.5])
#     ax.set_ylim([0, up_and_down_statistic.max()])
#     ax.set_xticklabels(['DOWN', 'UP'])
#     plt.yticks([])
#     plt.title("DISTRIBUTION OF UP AND DOWN")
#     fig.text(
#         0.5, 0.05,
#         'DATA FROM RICEQUANT',
#         ha='center')
#     plt.show()

print(rate_of_return.describe())

# Time series of daily returns. It gets its own figure: pandas draws Series
# plots on the current axes when one exists, so without an explicit axes the
# histogram/KDE below would be drawn over this line plot and their
# xlim=[-0.1, 0.1] would clip it.
fig_ts, ax_ts = plt.subplots(figsize=(15, 10))
rate_of_return.plot(kind='line', style='k--', ax=ax_ts,
                    title='Daily Yield Changes Over Time Series')

# Frequency distribution of the daily returns with a kernel density estimate
# on top. `density=True` replaces the `normed=True` keyword that was removed
# from matplotlib's hist().
fig_dist, ax_dist = plt.subplots(figsize=(15, 10))
rate_of_return.hist(bins=80, alpha=0.3, color='g', density=True, ax=ax_dist)
rate_of_return.plot(kind='kde', xlim=[-0.1, 0.1], style='r', grid=True, ax=ax_dist,
                    title='Frequency Distribution Of Up And Down & Kernel Density Estimate Curve')
plt.show()
三、机器学习
1.解决错误ValueError: Expected 2D array, got 1D array instead
这是由于在新版的sklearn中,传入的数据必须是二维数组,哪怕它只有一行或一列(比如前面做预测时只用了一个样本)。单个样本需要用.reshape(1, -1)转换成一行,单个特征则用.reshape(-1, 1)转换成一列。
参见:https://www.jianshu.com/p/60596270e94e
2.解决错误AttributeError: 'list' object has no attribute 'reshape'
原因是Python的list没有reshape方法,需要先用np.array()将其转换为NumPy数组,然后才能调用reshape,例如np.array(data).reshape(1, -1)。
参见:https://blog.csdn.net/yijiaobani/article/details/102957153
3.结果展示
4.代码
# Walk-forward experiment: how does the hit rate of a random forest that
# predicts tomorrow's up/down from the last WINDOW days change with the
# number of most-recent training samples it is fitted on?
from __future__ import division
from sklearn.ensemble import RandomForestClassifier
from collections import deque
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Alternative data source (RiceQuant research API), kept for reference:
# df = rd.get_price('CSI300.INDX', '2005-01-01', '2015-07-25').reset_index()[['OpeningPx', 'ClosingPx']]
df = pd.read_csv('if.csv', index_col=0)

# Work on a plain boolean array so that every integer below is a position.
# Indexing the Series with an int would be a label lookup (or rely on pandas'
# deprecated positional fallback), depending on what the CSV's first column is.
up_and_down = (df['ClosingPx'] - df['OpeningPx'] > 0).values

WINDOW = 2          # number of previous days used as features
START_INDEX = 600   # first day used as a training label; must be >= WINDOW

win_ratio2 = []
samples_list = list(range(1, 300))

for samples in samples_list:
    clf = RandomForestClassifier()
    # Rolling training set: only the `samples` most recent rows are kept.
    X = deque(maxlen=samples)
    y = deque(maxlen=samples)
    test_num = 0
    win_num = 0

    for current_index in range(START_INDEX, len(up_and_down) - 1):
        # Outcome of the next day, which the model tries to predict.
        fact = up_and_down[current_index + 1]

        # Training row: the WINDOW days before today -> today's outcome.
        X.append(list(up_and_down[current_index - WINDOW:current_index]))
        y.append(up_and_down[current_index])

        # Start testing once the rolling training set is full.
        if len(y) >= samples:
            test_num += 1
            clf.fit(X, y)
            # predict() needs a 2-D input even for a single sample.
            latest = np.array(
                list(up_and_down[current_index - WINDOW + 1:current_index + 1])
            ).reshape(1, -1)
            prediction = clf.predict(latest)
            if prediction[0] == fact:
                win_num += 1

    # NaN instead of ZeroDivisionError when the data is too short for any test.
    win_ratio2.append(win_num / test_num if test_num else float('nan'))

fig = plt.figure(figsize=(12, 10))
plt.plot(samples_list, win_ratio2, 'ro--')
plt.title('Win Ratio Of Different Number Of Training Samples')
plt.show()
四、涨跌时间窗口的选择
(1)结果
胜率稳定地在0.5(相当于扔硬币的概率)附近震荡。
(2)代码
# Walk-forward experiment: how does the hit rate of a random forest that
# predicts tomorrow's up/down change with the length of the look-back window
# (number of previous days used as features)?
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from collections import deque
import matplotlib.pyplot as plt

# Alternative data source (RiceQuant research API), kept for reference:
# df = rd.get_price('CSI300.INDX', '2005-01-01', '2015-07-25').reset_index()[['OpeningPx', 'ClosingPx']]
df = pd.read_csv('if.csv', index_col=0)

# Work on a plain boolean array so that every integer below is a position.
# Indexing the Series with an int would be a label lookup (or rely on pandas'
# deprecated positional fallback), depending on what the CSV's first column is.
up_and_down = (df['ClosingPx'] - df['OpeningPx'] > 0).values

TRAIN_SIZE = 100    # number of most recent samples the model is fitted on
START_INDEX = 400   # first day used as a training label; must be >= max window

win_ratio = []
window_list = list(range(1, 200))

for window in window_list:
    clf = RandomForestClassifier()
    # Rolling training set: only the TRAIN_SIZE most recent rows are kept.
    X = deque(maxlen=TRAIN_SIZE)
    y = deque(maxlen=TRAIN_SIZE)
    test_num = 0
    win_num = 0

    for current_index in range(START_INDEX, len(up_and_down) - 1):
        # Outcome of the next day, which the model tries to predict.
        fact = up_and_down[current_index + 1]

        # Training row: the `window` days before today -> today's outcome.
        X.append(list(up_and_down[current_index - window:current_index]))
        y.append(up_and_down[current_index])

        # Start testing once the rolling training set is full.
        if len(y) >= TRAIN_SIZE:
            test_num += 1
            clf.fit(X, y)
            # predict() needs a 2-D input even for a single sample.
            latest = np.array(
                list(up_and_down[current_index - window + 1:current_index + 1])
            ).reshape(1, -1)
            prediction = clf.predict(latest)
            if prediction[0] == fact:
                win_num += 1

    # NaN instead of ZeroDivisionError when the data is too short for any test.
    win_ratio.append(win_num / test_num if test_num else float('nan'))

fig = plt.figure(figsize=(12, 10))
plt.plot(window_list, win_ratio, 'm')
plt.show()
# Figure caption: Win Ratio Of Different Time Window
参考:https://www.ricequant.com/community/topic/103/ipython-notebook-research-alpha%E4%B8%8B%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E4%B8%80%E7%9E%A5-%E5%85%B3%E4%BA%8E%E8%B7%8C%E8%B7%8C%E6%B6%A8%E6%B6%A8%E7%9A%84%E6%80%9D%E8%80%83