資料擷取
全部指數資料從中證指數公司擷取
加載資料
使用pandas 解析所有excel, 儲存每個指數對應的成分股的權重,資料中的一些指數隻包含前十大權重股的權重,對這些指數中其他成分股的權重按照平均配置設定的方式設定。
load_xls.py
#-*- coding: utf-8 -*-
"""
File Name: load_xls.py
Author: ce39906
mail: [email protected]
Created Time: 2019-03-01 11:35:07
"""
import pandas as pd
import numpy as np
def load_close_weight_xls(filename):
print("loading ", filename)
df = pd.read_excel(filename,
converters={'指數代碼Index Code' : lambda x: str(x)})
index_name = df['指數名稱Index Name'].values[0]
index_code = df['指數代碼Index Code'].values[0]
constituent_code = df['成分券代碼Constituent Code'].values
weight = df['權重(%)Weight(%)'].values
n = weight.shape[0]
constituent_code_2_weight = {}
for i in range(n):
constituent_code_2_weight[constituent_code[i]] = weight[i]
return index_name, index_code, constituent_code_2_weight
def load_cons_xls(filename):
print("loading ", filename)
sheet1_df = pd.read_excel(filename,
converters={'指數代碼Index Code' : lambda x: str(x)})
index_name = sheet1_df['指數名稱Index Name'].values[0]
index_code = sheet1_df['指數代碼Index Code'].values[0]
constituent_code = sheet1_df['成分券代碼Constituent Code'].values
sheet2_df = pd.read_excel(filename, sheet_name='weight')
top10_code_2_weight = {}
for index, row in sheet2_df.iterrows():
code = row['代碼']
weight = row['權重']
top10_code_2_weight[code] = weight
total_count = constituent_code.shape[0]
top10_weight_count = len(top10_code_2_weight.keys())
top10_weight_sum = 0.0
for top10_weight in top10_code_2_weight.values():
top10_weight_sum += top10_weight
default_weight = (100.0 - top10_weight_sum) / (total_count - top10_weight_count)
constituent_code_2_weight = {}
for i in range(total_count):
code = constituent_code[i]
if code in top10_code_2_weight:
constituent_code_2_weight[code] = top10_code_2_weight[code]
else:
constituent_code_2_weight[code] = default_weight
return index_name, index_code, constituent_code_2_weigh
向量化以及繪制熱力圖
向量化的方式是統計出所有指數全部成分股作為向量的全部次元。
針對每個指數對應的向量,首先将向量值全部設定為0,然後在向量上為成分股對應的次元指派。
将每個指數都向量化後,合并到一個pandas dataframe中,dataframe中的每一列代表一個指數對應的向量。擷取dataframe後使用corr()方法獲得協方差矩陣,然後使用seaborn 繪制。
cal_stock_indices_correlation.py
#-*- coding: utf-8 -*-
"""
File Name: cal_stock_indices_correlation.py
Author: ce39906
mail: [email protected]
Created Time: 2019-03-01 14:31:30
"""
import load_xls
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
def load_data():
stock_indices_data = {}
constituent_codes = set()
for filename in os.listdir('data'):
if not filename.endswith('.xls'):
continue
if len(filename) == 21:
index_name, index_code, constituent_code_2_weight = load_xls.load_close_weight_xls('data/' + filename)
else:
index_name, index_code, constituent_code_2_weight = load_xls.load_cons_xls('data/' + filename)
for constituent_code in constituent_code_2_weight.keys():
constituent_codes.add(constituent_code)
stock_indices_data[index_name + '_' + str(index_code)] = constituent_code_2_weight
return list(constituent_codes), stock_indices_data
def vectorization(constituent_codes, stock_indices_data):
n = len(constituent_codes)
df = pd.DataFrame()
for stock_index_name, weights in stock_indices_data.items():
vector = [0.0] * n
for i in range(n):
code = constituent_codes[i]
if code in weights:
vector[i] = weights[code] / 100.0
df[stock_index_name] = vector
return df
def plot_corr_heatmap(df):
corr = df.corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr,
cmap='rainbow',
linewidths=0.05,
ax=ax,
square=True,
annot=True)
f.savefig('stock_indices_corr_heatmap.png')
def main():
constituent_codes, stock_indices_data = load_data()
df = vectorization(constituent_codes, stock_indices_data)
plot_corr_heatmap(df)
if __name__ == '__main__':
main()
熱力圖
