天天看點

RNote113---smbinning分箱并輸出到Excel

統計分析

# ------------------------------***統計分析函數***------------------------------
## parameter : x-feature,y-label,data-dataframe,
## 參數:
## ---- x:變量名 
## ---- y:标簽名  
## ---- data:資料框名
## ---- type:
## ---------descriptive_statistics:統計資料,包括缺失值個數等
## ---------fillnull:缺失值填補
## ---------fillpara:缺失值填補的參數
## -----------------newname:填補之後的新特征
## -----------------inplace:是否在原資料框增加這一列
## -----------------fillvalue:缺失值填補項
## ---------confusion_matrix:輸出混淆矩陣
## ---------ispercent:是否輸出%格式的
univariate_analyze_f <-
  function(x,y,data,ispercent = TRUE,
           type = "descriptive_statistics",
           fillpara = list(newname = paste0(x, "_new"),
           inplace = FALSE,fillvalue = -1)) {
    ## 加載需要的包
    tryCatch(
      expr = {library(gmodels)},
      error = function(e) {print("You Need Install 'gmodels' Packages First");return()}
    )
    tryCatch(
      expr = {library(reshape2)},
      error = function(e){print("You Need Install 'reshape2' Packages First");return()}
    )
    ## 缺失值填充的配置參數
    newname <- fillpara$newname
    inplace <- fillpara$inplace
    fillvalue <- fillpara$fillvalue
    if (type == "descriptive_statistics") {
      #make.names = NA 行名自動化,即1,2,3
      descriptive_data <- as.data.frame.array(summary(data[x]),
                                              make.names = NA)
      return(descriptive_data)
    } else if (type == 'fillnull') {
      if (sum(is.na(data[x])) > 0) {
        # 有缺失值
        if (inplace == TRUE) {
          data[newname] <- lapply(data[x], function(x) {ifelse(is.na(x), fillvalue, x)})[[1]]
          return(data)
        } else{
          newdata <- data.frame(newname = lapply(data[x], function(x){ifelse(is.na(x), fillvalue, x)})[[1]])
          colnames(newdata) <- newname
          return(newdata)
        }
      } else{print("No Miss Data")}
    } else if (type == 'confusion_matrix') {
      xx <- gmodels::CrossTable(
        data[, x],
        data[, y],
        prop.chisq = FALSE,
        prop.c = FALSE,
        prop.t = FALSE,
        dnn = c("Feature", "Label")
      )
      dt1 <- reshape2::dcast(as.data.frame(xx$t,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL)
      colnames(dt1) <- c("Feature/Lable",paste0("Freq_",colnames(dt1)[2:ncol(dt1)]))
      dt2 <- reshape2::dcast(as.data.frame(xx$prop.row,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL)
      colnames(dt2) <- c("Feature/Lable",paste0("Rate_",colnames(dt2)[2:ncol(dt2)]))
      dt <- merge(dt1,dt2,by = "Feature/Lable")
      dt3 <- reshape2::dcast(as.data.frame(xx$prop.tbl,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL)
      dt["Row Total"] <- apply(dt1[2:ncol(dt1)], MARGIN = 1, FUN = sum)
      dt["Row_Rate"] <- apply(dt3[2:ncol(dt3)], MARGIN = 1, FUN = sum)
      ## 按照列求和,有點問題。Rate_0+Rate_1=0
      Col_Total_Vector <- c(NA,apply(dt[2:ncol(dt)],MARGIN = 2, FUN = sum))
      names(Col_Total_Vector)[1] <- "Feature/Lable"
      Col_Total_DF <- as.data.frame(matrix(Col_Total_Vector,1,length(Col_Total_Vector),byrow = T),stringsAsFactors = FALSE)
      colnames(Col_Total_DF) <- names(Col_Total_Vector)
      dt <- rbind(dt,Col_Total_DF);dt[nrow(dt),1] <- "Column Total"
      ## 處理 Rate_0、Rate_1
      rate_len <- length(colnames(dt2))
      for (i in colnames(dt2)[2:rate_len]){
        dt[nrow(dt),i] <- dt[nrow(dt),paste0("Freq_",substr(i,6,nchar(i)))]/dt[nrow(dt),"Row Total"]
      }
      for(i in 2:ncol(dt)){dt[i] <- round(dt[i],4)}
      ## 是否輸出百分号形式的資料
      if(ispercent){
        percent_col <- names(which(apply(dt[2:ncol(dt)],2,function(x){any(x>0&x<1)})))
        for(i in percent_col ){dt[i] <- paste0(dt[,i]*100,"%")}
        return(dt)
      }else{return(dt)}
    }
  }
      

分箱

# ------------------------------***分箱函數***------------------------------

## this function can return information value
#### parameter seq_bin is like 10,20,30,so i can split by ","
#### parameter maxcat is the maximum number of categories 
## 參數:
## ---- x:變量名,不要因子,不要因子,不要因子 
## ---- y:标簽名  
## ---- data:資料框名
## ---- seq_bin:分箱間隔
## ---------auto;算法自動分箱
## ---------"10,20":類似這種格式,程式處理為{(-inf,10](10,20](20,+inf]}三組
## ---------type_level:重新分組,輸出結果
## ----maxcat:最大類别型變量個數,預設10,如果x的unique超過10,會報錯
## -----------------newname:填補之後的新特征


univariate_iv_f <- function(x, y, data, seq_bin = "auto",maxcat = 10,type_level = list()) { 
  tryCatch(
    expr = {library(smbinning)},
    error = function(e) {
      print("You Need Install 'smbinning' Packages First")
      install.packages("smbinning");library(smbinning)
    }
  )
  if (is.character(data[, x])|is.factor(data[, x])) {
    ## 類别型變量
    if(length(type_level)==0){
      ## 自動分箱,即不會合并類别
      data[, x] <- as.factor(data[, x])
      return(smbinning.factor(df = data,y = y,
                              x = x, maxcat = maxcat))
    }else{
      ## 按照分類合并資料
      for (i in names(type_level)){assign(i,type_level[[i]])}
      ## 循環完成指派操作
      ## 下面完成分類操作,又要用循環。簡直喪心病狂
      x_new <- c()
      for(j in data[, x] ){
        isin <- unlist(lapply(names(type_level),function(x){j %in% get0(x)}))
        if(sum(isin)==0){
          ## 如果不在list中,統一辨別為"others"
          newname <- "others"
        }else if(sum(isin)>1){
          ## 說明type_level設定有誤,存在交叉項
          print("Error: type_level have repetition ")
          return("Error: type_level have repetition ")
        }else{newname <- names(type_level)[which(isin)]}
        x_new <- c(x_new,newname)
      }
      x_colnew <- paste0(x,"_new")
      data[x_colnew] <- x_new
      data[, x_colnew] <- as.factor(data[, x_colnew])
      return(smbinning.factor(df = data,y = y,
                              x = x_colnew, maxcat = maxcat))
    }

  } else if (is.numeric(data[, x])) {
    ## 數值型變量
    if (seq_bin == "auto") {
      ## 算法自動分箱
      return(smbinning(
        df = data,y = y,
        x = x,p = 0.05
      ))
    } else{
      ## 人為輸入間隔點,分箱
      cuts <- as.numeric(unlist(stringr::str_split(seq_bin, ",")))
      return(smbinning.custom(df = data,y = y,x = x,cuts = cuts))
    }
  }
}

      

畫圖

# ------------------------------***畫圖函數***------------------------------
## 
## 參數:
## ---- x:變量名,不要因子,不要因子,不要因子,type = "iv",x,y有預設值不需要改 
## ---- y:标簽名  
## ---- data:資料框名,要求傳入的data是 smbinning 傳回的結果
## ---- type:兩個類型
## ----------distribution:連續值的密度分布
## ----------iv:分箱之後的資料進行畫圖
## ----------picname:特征,圖檔命名所用
## ---- overduerate:平均逾期率,需要添加在圖檔中

univariate_pic_f <- function(x="GoodRate",y="Cutpoint",picname = "Feature",
                             data,type = "distribution",overduerate = 0.5) {
    tryCatch(
      expr = {library(ggplot2)},
      error = function(e) {
        print("You Need Install 'ggplot2' Packages First")
        install.packages("ggplot2");library(ggplot2)
      }
    )
    if (type == "distribution") {
      if(is.numeric(data[, x])){
        # 如果是數值型,那麼輸出密度分布圖
        ## use ggplot2 generate distribution figure of the density
        ## this is just for continuous variable
        plotdata <- data[, c(x, y)];plotdata[, y] <- as.factor(plotdata[, y])
        colnames(plotdata) <- c("Feature", "Label")
        library(ggplot2)
        pic <- ggplot2::ggplot(data = plotdata, aes(x = Feature,color = Label,fill = Label)) +
          geom_density(alpha = .3)+scale_fill_brewer(type = "seq", palette = "Greens") +
          scale_colour_manual(values = c("red", "yellow")) +
          labs(title = paste0(stringr::str_to_title(x)," Distribution Of Different Groups"))
        return(pic)  
      }else{
        return("Input Must Be Continuous Variable")
      }
    }else if(type == "iv"){
      ## 輸入的是分箱之後的結果,需要畫出分組逾期率和覆寫率,并且添加平均逾期率
      library(ggplot2)
      mytheme <- theme(
        plot.title = element_text(
          size = 15,hjust =0.5, vjust = 1,color="black",
          face = "bold"), #改變标題的位置、顔色、字型大小
        panel.background=element_rect(fill="white",
                                      color="black"),
        panel.grid.major.y=element_line(color="black",
                                        linetype=2),
        panel.grid.minor.y=element_blank(),
        panel.grid.minor.x=element_blank()
      )
      plotdata <- data[1:(nrow(data)-2),1:ncol(data)]
      pic <- ggplot(data = plotdata, aes(x = Cutpoint, y = GoodRate, group = 1)) +
        geom_bar(mapping = aes(y = PctRec),
          position = "dodge",stat = "identity",
          width = 0.4,fill = " orange")+# 設定bar的大小,填充色為綠色
        geom_line(colour = "red",size = 1) +
        geom_point(colour = "red",size = 2,shape = 17) +
        geom_abline(slope = 0,intercept = overduerate,colour = "blue",size = 1) + # 添加水準線辨別總體逾期率
        geom_text(mapping = aes(label = GoodRate),size = 3,colour = 'black'
                  ,vjust = -0.8,hjust = .5,position = position_dodge(0.9))+
        labs(title = paste0(" Overdue Rate of ", picname," Bin"))+
        scale_y_continuous(limits=c(0,1),breaks=seq(from=0,to=1,by=0.05))+mytheme
      return(pic)
    }
}      

儲存圖檔

# ------------------------------***儲存圖檔函數***------------------------------
## 
## 參數:
## ---- pic:圖檔對象
## ---- savepath:儲存位址,預設目錄,需要查驗
## ---- picname:圖檔名稱
save_pic_f <- function(pic,savepath = "./Picture",picname = "Rplot" ){
  tryCatch(
    expr = {library(ggplot2)},
    error = function(e) {
      print("You Need Install 'ggplot2' Packages First")
      install.packages("ggplot2");library(ggplot2)
    }
  )
    ggsave(
      file = paste0(picname, ".png"),
      plot = pic,path = savepath,
      width = 5,height = 4,dpi = 600
    )
}
      

輸出到Excel

# ------------------------------***Excel互動函數***------------------------------
## 
## 參數:
## ---- wb_path:Excel: 存放目錄
## ---- wb_name:Excel: 名稱
## ---- create_sheet: 是否建立sheet
## ---- sheet_name: 建立sheet名稱or加載sheet名稱
## ---- pic_name: 需要儲存的圖檔名稱,預設png格式。此處無需字尾&路徑
## ---- pic_path: 圖檔的存放目錄
## ---- data: 需要儲存的dataframe資料
#### 測試方式
####--1.載入已有資料,建立sheet,并且儲存圖檔和資料
# save2excel(wb_path = "./Docment/",
#            wb_name = "Statistics",
#            create_sheet = TRUE,
#            sheet_name = "test6",
#            pic_path = "./Picture/",
#            pic_name = "Rplot",data = data,
#            save_pic = TRUE,save_data = TRUE)
save2excel <- function(wb_path = "./Document/",
                       wb_name = "Statistics",
                       create_sheet = TRUE,
                       sheet_name = "Sheet",
                       pic_path = "./Picture/",
                       pic_name = "test",data,
                       save_pic = TRUE,save_data = TRUE) {
  package_need <- c('tidyverse', 'rJava', 'xlsxjars', 'xlsx')
  package_install <- sapply(X=package_need, FUN = require, character.only = TRUE)
  if(!all(package_install)){
    ## 有package沒有安裝
    print(paste0("U Must install ",package_need[which(!package_install)]))
  }
  wb_complete_path <- paste0(wb_path,wb_name,".xlsx")
  pic_complete_path <- paste0(pic_path,pic_name,".png")
  # 先加載Excel
  wb <- loadWorkbook(wb_complete_path)
  if(create_sheet){
    # 建立Sheet
    usesheet <- createSheet(wb, sheetName=sheet_name)
  }else{
    # 加載現有Sheet
    usesheet <- eval(parse(text = paste0("getSheets(wb)$",sheet_name)))
  }
  if(save_pic){
    # 添加圖檔
    addPicture(
      file = pic_complete_path,
      sheet = usesheet, # 圖檔儲存的sheet
      startRow = 15,# 在sheet中的位置
      startColumn = 2
    )
  }
  if(save_data){
      ## 設定樣式
      #### cs1 标題行
      cs1 <-
        CellStyle(wb) +
        Alignment(horizontal = "ALIGN_CENTER", 
                  vertical = "VERTICAL_CENTER") + # 對齊方式,水準&豎直居中
        Border(
          color = "black",
          position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"),
          pen = c("BORDER_THIN",
                  "BORDER_THIN",
                  "BORDER_THIN",
                  "BORDER_THIN")) + # 所有邊框加框線&黑色框線
        Font(wb, isItalic = TRUE, isBold = TRUE)+ # 字型加粗、Italic字型
        Fill(
          foregroundColor = "cornflowerblue",
          backgroundColor = "cornflowerblue",
          pattern = "SOLID_FOREGROUND"
        ) #背景色為矢車菊藍
      #### 第一列樣式
      cs2 <-
        CellStyle(wb) + Border(
          color = "black",
          position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"),
          pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN")
        )+#黑色細框線
        Font(wb, isItalic = FALSE, isBold = TRUE)+
        Fill(
          foregroundColor = "lightblue",
          backgroundColor = "lightblue",
          pattern = "SOLID_FOREGROUND"
        )# 淺藍色填充 
      #### 非首列樣式
      cs3 <- 
        CellStyle(wb) + Border(
          color = "black",
          position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"),
          pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN")
        )#黑色細框線
      list_name <- paste0("`", 2:ncol(data),"`")
      colStyle_str <- paste("list(`1`=cs2,", paste0(list_name,"=","cs3",collapse = ","),")")
      colStyle <- eval(parse(text = colStyle_str))
      addDataFrame(
        x = data,# 要儲存的資料框
        sheet = usesheet,# 儲存的位置
        col.names = TRUE,# 是否保留列名
        row.names = FALSE,# 是否保留行名
        startRow = 1,#資料儲存的起始位置
        startColumn = 1,#資料儲存的起始位置
        colStyle= colStyle,# 第二、三列藍色
        colnamesStyle = cs1,# 列名的格式
        rownamesStyle = cs1,# 行名的格式
        showNA = FALSE,# 空值是否展示,預設不展示,即保留為空白格
        characterNA = "",# NA展示位空字元串和上面showNA參數有關系
        byrow = FALSE
      )

  }
  saveWorkbook(wb, wb_complete_path)
}
      

建立Excel

# ------------------------------***建立Excel函數***------------------------------
## 
## 參數:
## ---- create_wb 是否建立Excel
## ---- create_sheet 是否建立新Sheet
## ---- wb_path Excel存放目錄
## ---- wb_name Excel名稱
## ---- sheet_name Sheet名稱

create_excel_f <-
  function(create_wb = FALSE,
           create_sheet = TRUE,
           wb_path = "./Docment/",
           wb_name = "Statistics",
           sheet_name = "Sheet") {
    package_need <- c('tidyverse', 'rJava', 'xlsxjars', 'xlsx')
    package_install <- sapply(X=package_need, FUN = require, character.only = TRUE)
    if(!all(package_install)){
      ## 有package沒有安裝
      print(paste0("U Must install ",package_need[which(!package_install)]))
    }
    if(create_wb){
      ##建立Excel
      save_path <- paste0(wb_path,wb_name,".xlsx")
      # 建立Excel和Sheet
      wb <- createWorkbook()  
      if(create_sheet){
        sheet1 <- createSheet(wb, sheet_name)
      }
      #儲存Excel
      saveWorkbook(wb, save_path)
    }
  }