天天看点

[Java] 数据分析--数据预处理

数据结构

  • 键-值对:HashMap
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.util.HashMap;
 4 import java.util.Scanner;
 5 
 6 public class HashMapExample {
 7     public static void main(String[] args) {
 8         File dataFile = new File("data/Countries.dat");
 9         HashMap<String,Integer> dataset = new HashMap();
10         try {
11             Scanner input = new Scanner(dataFile);
12             while (input.hasNext()) {
13                 String country = input.next();
14                 int population = input.nextInt();
15                 dataset.put(country, population);
16             }
17         } catch (FileNotFoundException e) {
18             System.out.println(e);
19         }
20         System.out.printf("dataset.size(): %d%n", dataset.size());
21         System.out.printf("dataset.get(\"Peru\"): %,d%n", dataset.get("Peru"));
22     }
23 }      

View Code

文件处理

  • csv文件
    • 将Map数据存入Excel文件
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.io.FileOutputStream;
 4 import java.io.IOException;
 5 import java.util.Map;
 6 import java.util.Scanner;
 7 import java.util.Set;
 8 import java.util.TreeMap;
 9 import org.apache.poi.hssf.usermodel.HSSFRow;
10 import org.apache.poi.hssf.usermodel.HSSFSheet;
11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
12 
13 public class FromMapToExcel {
14     public static void main(String[] args) {
15         Map<String,Integer> map = new TreeMap();
16         load(map, "data/Countries.dat");
17         print(map);
18         storeXL(map, "data/Countries.xls", "Countries Worksheet");
19     }
20     
21     /** Loads the data from the specified file into the specified map.
22     */
23     public static void load(Map map, String fileSpec) {
24         File file = new File(fileSpec);
25         try {
26             Scanner input = new Scanner(file);
27             while (input.hasNext()) {
28                 String country = input.next();
29                 int population = input.nextInt();
30                 map.put(country, population);
31             }
32         } catch (FileNotFoundException e) {
33             System.out.println(e);
34         }
35     }
36     
37     public static void print(Map map) {
38         Set countries = map.keySet();
39         for (Object country : countries) {
40             Object population = map.get(country);
41             System.out.printf("%-10s%,12d%n", country, population);
42         }
43     }
44     
45     /** Stores the specified map in the specified worksheet of 
46         the specified Excel workbook file.
47      * @param map
48      * @param fileSpec
49      * @param sheet
50     */
51     public static void storeXL(Map map, String fileSpec, String sheet) {
52         try {
53             FileOutputStream out = new FileOutputStream(fileSpec);
54             HSSFWorkbook workbook = new HSSFWorkbook();
55             HSSFSheet worksheet = workbook.createSheet(sheet);
56             Set countries = map.keySet();
57             short rowNum = 0;
58             for (Object country : countries) {
59                 Object population = map.get(country);
60                 HSSFRow row = worksheet.createRow(rowNum);
61                 row.createCell(0).setCellValue((String)country);
62                 row.createCell(1).setCellValue((Integer)population);
63                 ++rowNum;
64             }
65             workbook.write(out);
66             out.flush();
67             out.close();
68         } catch (FileNotFoundException e) {
69             System.err.println(e);
70         } catch (IOException e) {
71             System.err.println(e);
72         }
73     }
74 }      
    • 读取csv文件
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.util.HashMap;
 4 import java.util.Scanner;
 5 
 6 public class ReadingCSVFiles {
 7     public static void main(String[] args) {
 8         File dataFile = new File("data/Countries.csv");
 9         try {
10             Scanner input = new Scanner(dataFile);
11             input.useDelimiter(",|\\s");
12             String column1 = input.next();
13             String column2 = input.next();
14             System.out.printf("%-10s%12s%n", column1, column2);
15             while (input.hasNext()) {
16                 String country = input.next();
17                 int population = input.nextInt();
18                 System.out.printf("%-10s%,12d%n", country, population);
19             }
20         } catch (FileNotFoundException e) {
21             System.out.println(e);
22         }
23     }
24 }      
    • 读取Excel到Map
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import static dawj.ch02.FromMapToExcel.print;
 2 import java.io.FileInputStream;
 3 import java.io.FileNotFoundException;
 4 import java.io.IOException;
 5 import java.util.Map;
 6 import java.util.TreeMap;
 7 import org.apache.poi.hssf.usermodel.HSSFCell;
 8 import org.apache.poi.hssf.usermodel.HSSFRow;
 9 import org.apache.poi.hssf.usermodel.HSSFSheet;
10 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
11 import org.apache.poi.ss.usermodel.DataFormatter;
12 import org.apache.poi.ss.usermodel.Row;
13 
14 public class FromExcelToMap {
15     public static void main(String[] args) {
16         Map map = loadXL("data/Countries.xls", "Countries Worksheet");
17         print(map);
18     }
19     
20     /** Returns a Map object containing the data from the specified 
21         worksheet in the specified Excel file.
22     */
23     public static Map loadXL(String fileSpec, String sheetName) {
24         Map<String,Integer> map = new TreeMap();
25         try {
26             FileInputStream stream = new FileInputStream(fileSpec);
27             HSSFWorkbook workbook = new HSSFWorkbook(stream);
28             HSSFSheet worksheet = workbook.getSheet(sheetName);
29             DataFormatter formatter = new DataFormatter();
30             for (Row row : worksheet) {
31                 HSSFRow hssfRow = (HSSFRow)row;
32                 HSSFCell cell = hssfRow.getCell(0);
33                 String country = cell.getStringCellValue();
34                 cell = hssfRow.getCell(1);
35                 String str = formatter.formatCellValue(cell);
36                 int population = (int)Integer.getInteger(str);
37                 map.put(country, population);
38             }
39         } catch (FileNotFoundException e) {
40             System.err.println(e);
41         } catch (IOException e) {
42             System.err.println(e);
43         }
44         return map;
45     }
46 }      
  • 解析JSON文件
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileInputStream;
 3 import java.io.FileNotFoundException;
 4 import java.io.IOException;
 5 import java.io.InputStream;
 6 import java.util.ArrayList;
 7 import java.util.HashMap;
 8 import javax.json.Json;
 9 import javax.json.stream.JsonParser;
10 import javax.json.stream.JsonParser.Event;
11 
12 public class ParsingJSONFiles {
13     public static void main(String[] args) {
14         File dataFile = new File("data/Books.json");
15         try {
16             InputStream stream = new FileInputStream(dataFile);
17             JsonParser parser = Json.createParser(stream);
18             Event event = parser.next();  // advance past START_OBJECT
19             HashMap<String,Object> map = getMap(parser);
20             System.out.println(map);
21             stream.close();
22         } catch (FileNotFoundException e) {
23             System.out.println(e);
24         } catch (IOException e) {
25             System.out.println(e);
26         }
27     }
28     
29     /*  Returns the HashMap parsed by the specified parser.
30         Called when event.equals(event.START_OBJECT):
31     */
32     public static HashMap getMap(JsonParser parser) {
33         HashMap<String,Object> map = new HashMap();
34         Event event = parser.next();  // advance past START_OBJECT
35         String key = parser.getString();
36         event = parser.next();       // advance past KEY_NAME
37         while (!event.equals(Event.END_OBJECT)) {
38             if (event.equals(Event.VALUE_STRING)) {
39                 String value = parser.getString();
40                 map.put(key, value);
41             } else if (event.equals(Event.VALUE_NUMBER)) {
42                 Integer value = parser.getInt();
43                 map.put(key, value);
44             } else if (event.equals(Event.START_ARRAY)) {
45                 ArrayList<String> list = getList(parser);
46                 map.put(key, list);
47             }
48             event = parser.next();
49             if (event.equals(Event.END_OBJECT)) {
50                 break;
51             }
52             key = parser.getString();
53             event = parser.next();
54         }
55         return map;
56     }
57     
58     /*  Returns the ArrayList parsed by the specified parser.
59         Called when event.equals(event.START_ARRAY):
60     */
61     public static ArrayList getList(JsonParser parser) {
62         ArrayList list = new ArrayList();
63         Event event = parser.next();  // advance past START_ARRAY
64         while (!event.equals(Event.END_ARRAY)) {
65             if (event.equals(Event.VALUE_STRING)) {
66                 list.add(parser.getString());
67                 event = parser.next();
68             } else if (event.equals(Event.START_OBJECT)) {
69                 HashMap<String,Object> map = getMap(parser);
70                 list.add(map);
71                 event = parser.next();
72             } else if (event.equals(Event.START_ARRAY)) {
73                 ArrayList subList = getList(parser);   //  recursion
74                 list.add(subList);
75                 event = parser.next();
76             }
77         }
78         return list;
79     }
80 }      

数据处理

  • 生成测试数据集
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.io.PrintWriter;
 4 import java.util.Random;
 5 
 6 public class GeneratingTestData {
 7     private static final int ROWS = 8, COLS = 5;
 8     private static final Random RANDOM = new Random();
 9     
10     public static void main(String[] args) {
11         File outputFile = new File("data/Output.csv");
12         try {
13             PrintWriter writer = new PrintWriter(outputFile);
14             for (int i = 0; i < ROWS; i++) {
15                 for (int j = 0; j < COLS-1; j++) {
16                     writer.printf("%.6f,", RANDOM.nextDouble());
17                 }
18                 writer.printf("%.6f%n", RANDOM.nextDouble());
19             }
20             writer.close();
21         } catch (FileNotFoundException e) {
22             System.err.println(e);
23         }
24     }
25 }      
  • 数据过滤
    • 需求:选择国土面积超过100万平方公里的内陆国家
    • 过程:数据为dat格式,先定义对应简单类country,再写程序将dat中数据存在country的Set中,最后做筛选

Country.java

[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.util.HashSet;
 2 import java.util.Scanner;
 3 
 4 class Country {
 5     protected String name;
 6     protected int population;
 7     protected int area;
 8     protected boolean landlocked;
 9 
10     /*  Constructs a new Country object from the next line being scanned.
11         If there are no more lines, the new object's fields are left null.
12     */
13     public Country(Scanner in) {
14         if (in.hasNextLine()) {
15             this.name = in.next();
16             this.population = in.nextInt();
17             this.area = in.nextInt();
18             this.landlocked = in.nextBoolean();
19         }
20     }
21 
22     @Override
23     public String toString() {
24         return String.format("%-10s %,12d %,12d %b", 
25                 name, population, area, landlocked);
26     }
27 }      

FilteringData.java

[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.util.HashSet;
 4 import java.util.Scanner;
 5 import java.util.Set;
 6 import java.util.TreeMap;
 7 
 8 public class FilteringData {
 9     private static final int MIN_AREA = 1000000;  // one million 
10     public static void main(String[] args) {
11         File file = new File("data/Countries.dat");
12         Set<Country> dataset = readDataset(file);
13         
14         for (Country country : dataset) {
15             if (country.landlocked && country.area >= MIN_AREA) {
16                 System.out.println(country);
17             }
18         }
19     }
20     
21     public static Set readDataset(File file) {
22         Set<Country> set = new HashSet();
23         try {
24             Scanner input = new Scanner(file);
25             input.nextLine();  // read past headers
26             while (input.hasNextLine()) {
27                 set.add(new Country(input));
28             }
29             input.close();
30         } catch (FileNotFoundException e) {
31             System.out.println(e);
32         }
33         return set;
34     }
35 }      
  • 排序
    • 需求:将Countries.dat中数据按population进行排序
    • 实现:将数据存入TreeMap
    • 注意:关键字段必须唯一,即两个国家人口不能相同
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.util.Collections;
 4 import java.util.HashMap;
 5 import java.util.Scanner;
 6 import java.util.Set;
 7 import java.util.TreeMap;
 8 
 9 public class SortingData {
10     public static void main(String[] args) {
11         File file = new File("src/main/java/com/hongfeng/SortingData/Countries.dat");
12         TreeMap<Integer,String> dataset = new TreeMap();
13         try {
14             Scanner input = new Scanner(file);
15             while (input.hasNext()) {
16                 String x = input.next();
17                 int y = input.nextInt();
18                 dataset.put(y, x);
19             }
20             input.close();
21         } catch (FileNotFoundException e) {
22             System.out.println(e);
23         }
24         print(dataset);
25     }
26     
27     public static void print(TreeMap<Integer,String> map) {
28         for (Integer key : map.keySet()) {
29             System.out.printf("%,12d  %-16s%n", key, map.get(key));
30         }
31     }
32 }      
  • 合并
    • 需求:将多个排好序的文件合并为单个排好序的文件
    • Country类实现Comparable接口,定义从文件创建对象的构造方法,以及比较方法
    • 扫描两个文件,比较,存入新文件,一个文件扫描完后,另一个文件逐项扫描即可
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.util.Scanner;
 2 
 3 class Country implements Comparable{
 4     protected String name;
 5     protected int population;
 6 
 7     /*  Constructs a new Country object from the next line being scanned.
 8         If there are no more lines, the new object's fields are left null.
 9     */
10     public Country(Scanner in) {
11         if (in.hasNextLine()) {
12             this.name = in.next();
13             this.population = in.nextInt();
14         }
15     }
16 
17     public boolean isNull(){
18         return this.name == null;
19     }
20 
21     @Override
22     public int compareTo(Object object){
23         Country that = (Country)object;
24         return this.population - that.population;
25     }
26 
27     @Override
28     public String toString() {
29         return String.format("%-10s %,12d",
30                 name, population);
31     }
32 }      
[Java] 数据分析--数据预处理
[Java] 数据分析--数据预处理
1 import java.io.File;
 2 import java.io.FileNotFoundException;
 3 import java.io.PrintWriter;
 4 import java.util.Scanner;
 5 
 6 public class MergingFiles {
 7     public static void main(String[] args) {
 8         File inFile1 = new File("data/Countries1.dat");
 9         File inFile2 = new File("data/Countries2.dat");
10         File outFile = new File("data/Countries.dat");
11         try {
12             Scanner in1 = new Scanner(inFile1);
13             Scanner in2 = new Scanner(inFile2);
14             PrintWriter out = new PrintWriter(outFile);
15             Country country1 = new Country(in1);
16             Country country2 = new Country(in2);  
17             System.out.println(country1.hashCode());
18             System.out.println(country2.hashCode());
19             while (!country1.isNull() && !country2.isNull()) {
20                 if (country1.compareTo(country2) < 0) {
21                     out.println(country1);
22                     country1 = new Country(in1);
23                 } else {
24                     out.println(country2);
25                     country2 = new Country(in2);
26                 }
27             }
28             while (!country1.isNull()) {
29                 out.println(country1);
30                 country1 = new Country(in1);
31             }
32             while (!country2.isNull()) {
33                 out.println(country2);
34                 country2 = new Country(in2);
35             }
36             in1.close();
37             in2.close();
38             out.close();
39         } catch (FileNotFoundException e) {
40             System.out.println(e);
41         }
42     }
43 }