这学期参加了服务外包大赛,具体要实现对非结构化数据的分析处理,所以在这里把这个过程一点点记录一下。
首先使用 Python 的爬虫框架,从网页上抓取了中文文本:
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiAzNvwVZ2x2bzNXak9CX90TQNNkRrFlQKBTSvwFbslmZvwFMwQzLcVmepNHdu9mZvwFVywUNMZTY18CX052bm9CX9M2MYVDdykFbKJDTwYVbiVHNHpleO1GTulzRilWO5x0LcRHelR3LcJzLctmch1mclRXY39jNxkTN0cjMwEjNxMDM4EDMy8CX0Vmbu4GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)
但是由于我不太会处理中文数据,摸索了很久,最后只是简单地通过 Java 的 substring 把每行数据按冒号分开:
package se;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * Rewrites the {@code key:value} lines of {@code info.txt} as space-aligned
 * two-column lines in {@code output1.txt}, echoing each rewritten line to
 * standard output.
 *
 * <p>The first input line is copied through unchanged and followed by a line
 * holding a single space. Every later line is split at its LAST {@code ':'};
 * lines that cannot be split into a non-empty key and a non-empty value are
 * dropped.
 */
public class sdf {
    /** Input file, resolved against the working directory. */
    private static final String INPUT_PATH = "info.txt";

    /** Output file, resolved against the working directory; created or truncated. */
    private static final String OUTPUT_PATH = "output1.txt";

    /** Character separating the key from the value on each input line. */
    private static final char SEPARATOR = ':';

    /** Zero-based column at which the value starts when the key is short enough. */
    private static final int VALUE_COLUMN = 21;

    /**
     * Program entry point; takes no arguments.
     *
     * <p>I/O failures are reported on standard error via the stack trace and
     * end the run.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        // NOTE(review): both streams use the platform default charset, as before.
        // If info.txt is UTF-8 and the platform default is not, the Chinese text
        // will be garbled -- confirm and pass an explicit charset if needed.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(new FileInputStream(new File(INPUT_PATH))));
             BufferedWriter out = new BufferedWriter(new FileWriter(new File(OUTPUT_PATH)))) {

            String line = br.readLine();
            if (line == null) {
                // Empty input: leave an empty output file instead of failing on null.
                return;
            }

            // The first line is copied verbatim, followed by a one-space spacer line.
            out.write(line);
            out.write("\r\n");
            out.write(" \r\n");

            // Test for end-of-file BEFORE touching the line; the loop must not
            // dereference the null that readLine() returns at EOF.
            while ((line = br.readLine()) != null) {
                String formatted = formatLine(line);
                if (formatted == null) {
                    continue;
                }
                System.out.print(formatted);
                System.out.print("\n");
                out.write(formatted);
                out.write("\r\n");
                // Flush per line so partial output survives a later failure.
                out.flush();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Formats one {@code key:value} line as {@code key}, padding spaces, then
     * {@code value}.
     *
     * <p>The split happens at the last {@code ':'} in the line. The key is
     * padded with spaces so the value starts at column {@link #VALUE_COLUMN};
     * when the key is too long for that, a single space is still inserted so
     * the two fields never run together.
     *
     * <p>Package-private rather than private so it can be unit-tested.
     *
     * @param line one input line without its line terminator; must not be null
     * @return the formatted line without a line terminator, or {@code null} if
     *         the line has no separator, starts with its only separator, or
     *         ends with the separator
     */
    static String formatLine(String line) {
        int split = line.lastIndexOf(SEPARATOR);
        if (split <= 0 || split == line.length() - 1) {
            return null;
        }

        int padding = Math.max(1, VALUE_COLUMN - split);
        StringBuilder sb = new StringBuilder(line.length() + padding);
        sb.append(line, 0, split);
        for (int i = 0; i < padding; i++) {
            sb.append(' ');
        }
        sb.append(line, split + 1, line.length());
        return sb.toString();
    }
}
这样每行的两个字段之间就用空格隔开了,之后再按空格分列,就可以把数据导入到 Excel 中。