資料初始化
# Placeholders for the two RDDs; one of the loading snippets fills them in.
imageRDD = labelRDD = None
# Image and label data live in sibling sub-directories of `output`.
output_labels = output + "/labels"
output_images = output + "/images"
讀取CSV資料
def fromCSV(s):
    """Convert one line of comma-separated numbers into a list of floats.

    Args:
        s: A single CSV line, e.g. ``"0.0,1.5,255"``.

    Returns:
        A list with one float per comma-separated field, or an empty list
        when ``s`` is the empty string.

    Raises:
        ValueError: If any field cannot be parsed by ``float`` (this
            includes empty fields such as the one after a trailing comma).
    """
    # An empty line carries no values; "".split(',') would yield [''],
    # which float() cannot parse, so short-circuit once up front instead
    # of re-testing the whole line for every field.
    if not s:
        return []
    return [float(field) for field in s.split(',')]
# Read each directory as text lines and parse every line with fromCSV.
imageRDD, labelRDD = (
    sc.textFile(csv_dir).map(fromCSV)
    for csv_dir in (output_images, output_labels)
)
讀取pickle資料
# Load the image and label RDDs from their pickle-file directories.
# NOTE(review): pickleFile presumably unpickles the stored objects, so it
# should only be pointed at trusted paths - confirm.
imageRDD, labelRDD = map(sc.pickleFile, (output_images, output_labels))
讀取tfrecord資料
# Load the files under `output` through the TensorFlow Hadoop input format.
# Each element is a (key, value) pair; the key (BytesWritable) is what gets
# decoded below, the value is a NullWritable.
tfRDD = sc.newAPIHadoopFile(
    output,
    "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
    keyClass="org.apache.hadoop.io.BytesWritable",
    valueClass="org.apache.hadoop.io.NullWritable",
)
# Only imageRDD is assigned in this snippet; fromTFExample is defined
# elsewhere and receives the key converted to bytes.
imageRDD = tfRDD.map(lambda record: fromTFExample(bytes(record[0])))
資料轉化在另外一篇部落格
資料轉化