天天看点

H2O+Spark 计算皮尔逊相关系数

# Variant 1: external-cluster mode with a manually started H2O cluster.
# set_h2o_cluster("10.111.32.12", 54321): IP and port of any node in the external H2O cluster.
# set_client_ip("10.111.32.16"): IP of the H2O client; it may be the same host.
conf = H2OConf(ss).set_external_cluster_mode().use_manual_cluster_start().set_h2o_cluster("10.111.32.12",54321).set_client_ip("10.111.32.16").set_cloud_name("test")
hc = H2OContext.getOrCreate(ss, conf)
# Variant 2: external-cluster mode with automatic cluster start, 1 external node,
# mapper xmx "2G", YARN queue "h2o", cloud name "h2o_gbm".
# NOTE(review): set_h2o_driver_path() is called with no argument here, while the
# variant below passes a jar path -- confirm this call is valid as written.
conf =H2OConf(ss).set_external_cluster_mode().set('HADOOP_USER_NAME','dp').set_h2o_driver_path().set_user_name('dp').use_auto_cluster_start().set_num_of_external_h2o_nodes(1).set_mapper_xmx("2G").set_yarn_queue("h2o").set_cloud_name("h2o_gbm")

# Variant 3: same chain as variant 2, but with an explicit driver jar path,
# 2 external nodes, mapper xmx "6G" and the "default" YARN queue.
conf = H2OConf(ss).set_external_cluster_mode().set('HADOOP_USER_NAME', 'dp').set_h2o_driver_path(
    "/home/dp/h2odriver/h2odriver-sw2.3.18-hdp2.6-extended.jar").set_user_name(
    'dp').use_auto_cluster_start().set_num_of_external_h2o_nodes(2).set_mapper_xmx("6G").set_yarn_queue(
    "default").set_cloud_name("h2o_gbm")
# Variant 4: a fresh H2OConf on which only the worker count is set; no
# external-cluster mode is requested on this one. It replaces the conf built above.
conf = H2OConf(ss)
conf.set_num_h2o_workers(2)

# Each assignment to conf overwrites the previous one, so only the most
# recently assigned conf (variant 4 as written) is passed to this call.
hc = H2OContext.getOrCreate(ss, conf)


# Hand df_corr (presumably a Spark DataFrame -- confirm) to H2O as a frame
# named 'df_corr_h2o'.
df_corr_h2o = hc.as_h2o_frame(df_corr,framename='df_corr_h2o')
# Correlation computed on the H2O side via the frame's cor() method, called
# with its default arguments.
temp_corr_features_list = df_corr_h2o.cor()

# Correlation via Correlation.corr on the 'features' column.
# NOTE(review): this reads `df`, not `df_corr`, and requests method="spearman";
# confirm both are intended. corr_list is not used in the lines shown here.
corr_list = Correlation.corr(df, 'features' ,method="spearman")

# Print the H2O correlation result, its Python type, and the seconds elapsed
# since start_time.
# NOTE(review): "话费时间" in the message looks like a typo for "花费时间"
# (time spent); it is left unchanged because it is runtime output.
print("相关系数计算:",temp_corr_features_list,"    type ",type(temp_corr_features_list),"话费时间:",time.time()-start_time)

# Candidate columns: every key of final_table_schema except self.y_col,
# in the schema's own key order.
columns_cols = [
    name for name in final_table_schema.keys() if name != self.y_col
]

# Candidate columns: schema keys other than self.y_col whose schema value
# is not the string 'timestamp', in the schema's own order.
columns_cols = [
    name
    for name, dtype in final_table_schema.items()
    if name != self.y_col and dtype != 'timestamp'
]

           

继续阅读