天天看点

数据清洗——箱形图的实现

最近参加华为的比赛,需要对数据进行清洗。查阅资料后,我选择了箱形图方法来识别异常值;对识别出的异常值采用线性插值处理,即用其前后相邻两个数的平均值替换。下面是代码实现。

import java.util.ArrayList;

public class BoxPlot {

    /** Whisker multiplier used by {@link #boxPlot(double[])}. */
    private static final double DEFAULT_K = 2.8;

    /**
     * Box-plot based data cleaning using the default whisker multiplier (2.8).
     *
     * <p>Equivalent to {@code boxPlot(data, 2.8)}.
     *
     * @param data the samples to clean, in their original order; never modified
     * @return a new array of the same length in which every outlier has been
     *         replaced (see {@link #boxPlot(double[], double)})
     * @throws NullPointerException if {@code data} is {@code null}
     */
    public static double[] boxPlot(double[] data) {
        return boxPlot(data, DEFAULT_K);
    }

    /**
     * Box-plot based data cleaning.
     *
     * <p>The quartiles Q1 and Q3 are taken from a sorted copy of {@code data}
     * at the fractional positions {@code n/4} and {@code 3n/4} (zero-based,
     * linearly interpolated between the two surrounding elements). Every value
     * outside {@code [Q1 - k*IQR, Q3 + k*IQR]} is treated as an outlier and is
     * replaced, scanning from left to right, by:
     * <ul>
     *   <li>the average of the (already cleaned) previous value and the next
     *       non-outlier value, when both exist;</li>
     *   <li>the next non-outlier value, when the outlier is the first element;</li>
     *   <li>the previous value, when no non-outlier follows it.</li>
     * </ul>
     *
     * <p>If Q1 equals Q3, Q3 is moved upwards through the sorted data until it
     * differs from Q1; if no such element exists the data is returned unchanged.
     *
     * @param data the samples to clean, in their original order; never modified
     * @param k    non-negative multiplier applied to the inter-quartile range
     * @return a new array with the same length as {@code data}
     * @throws NullPointerException     if {@code data} is {@code null}
     * @throws IllegalArgumentException if {@code k} is negative or NaN
     */
    public static double[] boxPlot(double[] data, double k) {
        if (data == null) {
            throw new NullPointerException("data");
        }
        if (!(k >= 0)) {
            throw new IllegalArgumentException("k must be non-negative, got " + k);
        }

        double[] result = data.clone();
        int n = data.length;
        if (n == 0) {
            return result;
        }

        // Sort a private copy so the caller's array keeps its original order.
        double[] sorted = data.clone();
        sort(sorted, 0, n - 1);

        // Lower and upper quartile.
        double q1 = quantile(sorted, (double) n / 4);
        double q3Position = (double) n * 3 / 4;
        double q3 = quantile(sorted, q3Position);

        // If both quartiles coincide, walk Q3 upwards through the sorted data
        // until it differs from Q1; give up when the data is exhausted.
        int probe = (int) q3Position;
        while (q3 == q1) {
            if (probe == n) {
                return result;
            }
            q3 = sorted[probe];
            probe++;
        }

        double iqr = q3 - q1;
        double upperLimit = q3 + k * iqr;
        double lowerLimit = q1 - k * iqr;

        boolean[] outlier = new boolean[n];
        for (int i = 0; i < n; i++) {
            outlier[i] = result[i] > upperLimit || result[i] < lowerLimit;
        }

        // nextKept[i] = index of the closest non-outlier to the right of i, or -1.
        int[] nextKept = new int[n];
        int next = -1;
        for (int i = n - 1; i >= 0; i--) {
            nextKept[i] = next;
            if (!outlier[i]) {
                next = i;
            }
        }

        // Replace outliers left to right so that result[i - 1] is already cleaned.
        for (int i = 0; i < n; i++) {
            if (!outlier[i]) {
                continue;
            }
            int j = nextKept[i];
            if (j < 0) {
                // No valid value follows: repeat the previous value.
                // (i == 0 here would mean every value was flagged; leave it as is.)
                if (i > 0) {
                    result[i] = result[i - 1];
                }
            } else if (i == 0) {
                // No previous value: take the next valid one.
                result[i] = result[j];
            } else {
                result[i] = (result[i - 1] + result[j]) / 2;
            }
        }
        return result;
    }

    /**
     * Linearly interpolates the value at fractional index {@code position} of a
     * sorted, non-empty array. When the upper neighbour would fall past the end
     * of the array the last element is returned instead of reading out of bounds.
     */
    private static double quantile(double[] sorted, double position) {
        int lower = (int) position;
        if (lower + 1 >= sorted.length) {
            return sorted[sorted.length - 1];
        }
        return (lower + 1 - position) * sorted[lower] + (position - lower) * sorted[lower + 1];
    }

    /**
     * Sorts {@code a[low..high]} (both inclusive) into ascending order, in place,
     * using a top-down merge sort.
     *
     * @param a    the array to sort
     * @param low  index of the first element of the range
     * @param high index of the last element of the range
     * @return the same array instance {@code a}
     */
    public static double[] sort(double[] a, int low, int high) {
        if (low < high) {
            // Written this way so that low + high cannot overflow.
            int mid = low + (high - low) / 2;
            sort(a, low, mid);
            sort(a, mid + 1, high);
            // Merge the two sorted halves.
            merge(a, low, mid, high);
        }
        return a;
    }

    /**
     * Merges the two adjacent sorted ranges {@code a[low..mid]} and
     * {@code a[mid+1..high]} into one sorted range {@code a[low..high]}.
     *
     * @param a    the array holding both ranges
     * @param low  first index of the left range
     * @param mid  last index of the left range
     * @param high last index of the right range
     */
    public static void merge(double[] a, int low, int mid, int high) {
        double[] temp = new double[high - low + 1];
        int i = low;
        int j = mid + 1;
        int k = 0;
        // Move the smaller head element into the scratch array first.
        while (i <= mid && j <= high) {
            if (a[i] < a[j]) {
                temp[k++] = a[i++];
            } else {
                temp[k++] = a[j++];
            }
        }
        // Copy whatever is left of the left range.
        while (i <= mid) {
            temp[k++] = a[i++];
        }
        // Copy whatever is left of the right range.
        while (j <= high) {
            temp[k++] = a[j++];
        }
        // Write the merged result back over the original range.
        for (int x = 0; x < temp.length; x++) {
            a[x + low] = temp[x];
        }
    }
}