統計分析
# ------------------------------***統計分析函數***------------------------------
## parameter : x-feature,y-label,data-dataframe,
## 參數:
## ---- x:變量名
## ---- y:标簽名
## ---- data:資料框名
## ---- type:
## ---------descriptive_statistics:統計資料,包括缺失值個數等
## ---------fillnull:缺失值填補
## ---------fillpara:缺失值填補的參數
## -----------------newname:填補之後的新特征
## -----------------inplace:是否在原資料框增加這一列
## -----------------fillvalue:缺失值填補項
## ---------confusion_matrix:輸出混淆矩陣
## ---------ispercent:是否輸出%格式的
univariate_analyze_f <-
function(x,y,data,ispercent = TRUE,
type = "descriptive_statistics",
fillpara = list(newname = paste0(x, "_new"),
inplace = FALSE,fillvalue = -1)) {
## 加載需要的包
tryCatch(
expr = {library(gmodels)},
error = function(e) {print("You Need Install 'gmodels' Packages First");return()}
)
tryCatch(
expr = {library(reshape2)},
error = function(e){print("You Need Install 'reshape2' Packages First");return()}
)
## 缺失值填充的配置參數
newname <- fillpara$newname
inplace <- fillpara$inplace
fillvalue <- fillpara$fillvalue
if (type == "descriptive_statistics") {
#make.names = NA 行名自動化,即1,2,3
descriptive_data <- as.data.frame.array(summary(data[x]),
make.names = NA)
return(descriptive_data)
} else if (type == 'fillnull') {
if (sum(is.na(data[x])) > 0) {
# 有缺失值
if (inplace == TRUE) {
data[newname] <- lapply(data[x], function(x) {ifelse(is.na(x), fillvalue, x)})[[1]]
return(data)
} else{
newdata <- data.frame(newname = lapply(data[x], function(x){ifelse(is.na(x), fillvalue, x)})[[1]])
colnames(newdata) <- newname
return(newdata)
}
} else{print("No Miss Data")}
} else if (type == 'confusion_matrix') {
xx <- gmodels::CrossTable(
data[, x],
data[, y],
prop.chisq = FALSE,
prop.c = FALSE,
prop.t = FALSE,
dnn = c("Feature", "Label")
)
dt1 <- reshape2::dcast(as.data.frame(xx$t,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL)
colnames(dt1) <- c("Feature/Lable",paste0("Freq_",colnames(dt1)[2:ncol(dt1)]))
dt2 <- reshape2::dcast(as.data.frame(xx$prop.row,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL)
colnames(dt2) <- c("Feature/Lable",paste0("Rate_",colnames(dt2)[2:ncol(dt2)]))
dt <- merge(dt1,dt2,by = "Feature/Lable")
dt3 <- reshape2::dcast(as.data.frame(xx$prop.tbl,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL)
dt["Row Total"] <- apply(dt1[2:ncol(dt1)], MARGIN = 1, FUN = sum)
dt["Row_Rate"] <- apply(dt3[2:ncol(dt3)], MARGIN = 1, FUN = sum)
## 按照列求和,有點問題。Rate_0+Rate_1=0
Col_Total_Vector <- c(NA,apply(dt[2:ncol(dt)],MARGIN = 2, FUN = sum))
names(Col_Total_Vector)[1] <- "Feature/Lable"
Col_Total_DF <- as.data.frame(matrix(Col_Total_Vector,1,length(Col_Total_Vector),byrow = T),stringsAsFactors = FALSE)
colnames(Col_Total_DF) <- names(Col_Total_Vector)
dt <- rbind(dt,Col_Total_DF);dt[nrow(dt),1] <- "Column Total"
## 處理 Rate_0、Rate_1
rate_len <- length(colnames(dt2))
for (i in colnames(dt2)[2:rate_len]){
dt[nrow(dt),i] <- dt[nrow(dt),paste0("Freq_",substr(i,6,nchar(i)))]/dt[nrow(dt),"Row Total"]
}
for(i in 2:ncol(dt)){dt[i] <- round(dt[i],4)}
## 是否輸出百分号形式的資料
if(ispercent){
percent_col <- names(which(apply(dt[2:ncol(dt)],2,function(x){any(x>0&x<1)})))
for(i in percent_col ){dt[i] <- paste0(dt[,i]*100,"%")}
return(dt)
}else{return(dt)}
}
}
分箱
# ------------------------------***分箱函數***------------------------------
## this function can return information value
#### parameter seq_bin is like 10,20,30,so i can split by ","
#### parameter maxcat is the maximum number of categories
## 參數:
## ---- x:變量名,不要因子,不要因子,不要因子
## ---- y:标簽名
## ---- data:資料框名
## ---- seq_bin:分箱間隔
## ---------auto;算法自動分箱
## ---------"10,20":類似這種格式,程式處理為{(-inf,10](10,20](20,+inf]}三組
## ---------type_level:重新分組,輸出結果
## ----maxcat:最大類别型變量個數,預設10,如果x的unique超過10,會報錯
## -----------------newname:填補之後的新特征
univariate_iv_f <- function(x, y, data, seq_bin = "auto",maxcat = 10,type_level = list()) {
tryCatch(
expr = {library(smbinning)},
error = function(e) {
print("You Need Install 'smbinning' Packages First")
install.packages("smbinning");library(smbinning)
}
)
if (is.character(data[, x])|is.factor(data[, x])) {
## 類别型變量
if(length(type_level)==0){
## 自動分箱,即不會合并類别
data[, x] <- as.factor(data[, x])
return(smbinning.factor(df = data,y = y,
x = x, maxcat = maxcat))
}else{
## 按照分類合并資料
for (i in names(type_level)){assign(i,type_level[[i]])}
## 循環完成指派操作
## 下面完成分類操作,又要用循環。簡直喪心病狂
x_new <- c()
for(j in data[, x] ){
isin <- unlist(lapply(names(type_level),function(x){j %in% get0(x)}))
if(sum(isin)==0){
## 如果不在list中,統一辨別為"others"
newname <- "others"
}else if(sum(isin)>1){
## 說明type_level設定有誤,存在交叉項
print("Error: type_level have repetition ")
return("Error: type_level have repetition ")
}else{newname <- names(type_level)[which(isin)]}
x_new <- c(x_new,newname)
}
x_colnew <- paste0(x,"_new")
data[x_colnew] <- x_new
data[, x_colnew] <- as.factor(data[, x_colnew])
return(smbinning.factor(df = data,y = y,
x = x_colnew, maxcat = maxcat))
}
} else if (is.numeric(data[, x])) {
## 數值型變量
if (seq_bin == "auto") {
## 算法自動分箱
return(smbinning(
df = data,y = y,
x = x,p = 0.05
))
} else{
## 人為輸入間隔點,分箱
cuts <- as.numeric(unlist(stringr::str_split(seq_bin, ",")))
return(smbinning.custom(df = data,y = y,x = x,cuts = cuts))
}
}
}
畫圖
# ------------------------------***畫圖函數***------------------------------
##
## 參數:
## ---- x:變量名,不要因子,不要因子,不要因子,type = "iv",x,y有預設值不需要改
## ---- y:标簽名
## ---- data:資料框名,要求傳入的data是 smbinning 傳回的結果
## ---- type:兩個類型
## ----------distribution:連續值的密度分布
## ----------iv:分箱之後的資料進行畫圖
## ----------picname:特征,圖檔命名所用
## ---- overduerate:平均逾期率,需要添加在圖檔中
univariate_pic_f <- function(x="GoodRate",y="Cutpoint",picname = "Feature",
data,type = "distribution",overduerate = 0.5) {
tryCatch(
expr = {library(ggplot2)},
error = function(e) {
print("You Need Install 'ggplot2' Packages First")
install.packages("ggplot2");library(ggplot2)
}
)
if (type == "distribution") {
if(is.numeric(data[, x])){
# 如果是數值型,那麼輸出密度分布圖
## use ggplot2 generate distribution figure of the density
## this is just for continuous variable
plotdata <- data[, c(x, y)];plotdata[, y] <- as.factor(plotdata[, y])
colnames(plotdata) <- c("Feature", "Label")
library(ggplot2)
pic <- ggplot2::ggplot(data = plotdata, aes(x = Feature,color = Label,fill = Label)) +
geom_density(alpha = .3)+scale_fill_brewer(type = "seq", palette = "Greens") +
scale_colour_manual(values = c("red", "yellow")) +
labs(title = paste0(stringr::str_to_title(x)," Distribution Of Different Groups"))
return(pic)
}else{
return("Input Must Be Continuous Variable")
}
}else if(type == "iv"){
## 輸入的是分箱之後的結果,需要畫出分組逾期率和覆寫率,并且添加平均逾期率
library(ggplot2)
mytheme <- theme(
plot.title = element_text(
size = 15,hjust =0.5, vjust = 1,color="black",
face = "bold"), #改變标題的位置、顔色、字型大小
panel.background=element_rect(fill="white",
color="black"),
panel.grid.major.y=element_line(color="black",
linetype=2),
panel.grid.minor.y=element_blank(),
panel.grid.minor.x=element_blank()
)
plotdata <- data[1:(nrow(data)-2),1:ncol(data)]
pic <- ggplot(data = plotdata, aes(x = Cutpoint, y = GoodRate, group = 1)) +
geom_bar(mapping = aes(y = PctRec),
position = "dodge",stat = "identity",
width = 0.4,fill = " orange")+# 設定bar的大小,填充色為綠色
geom_line(colour = "red",size = 1) +
geom_point(colour = "red",size = 2,shape = 17) +
geom_abline(slope = 0,intercept = overduerate,colour = "blue",size = 1) + # 添加水準線辨別總體逾期率
geom_text(mapping = aes(label = GoodRate),size = 3,colour = 'black'
,vjust = -0.8,hjust = .5,position = position_dodge(0.9))+
labs(title = paste0(" Overdue Rate of ", picname," Bin"))+
scale_y_continuous(limits=c(0,1),breaks=seq(from=0,to=1,by=0.05))+mytheme
return(pic)
}
}
儲存圖檔
# ------------------------------***儲存圖檔函數***------------------------------
##
## 參數:
## ---- pic:圖檔對象
## ---- savepath:儲存位址,預設目錄,需要查驗
## ---- picname:圖檔名稱
save_pic_f <- function(pic,savepath = "./Picture",picname = "Rplot" ){
tryCatch(
expr = {library(ggplot2)},
error = function(e) {
print("You Need Install 'ggplot2' Packages First")
install.packages("ggplot2");library(ggplot2)
}
)
ggsave(
file = paste0(picname, ".png"),
plot = pic,path = savepath,
width = 5,height = 4,dpi = 600
)
}
輸出到Excel
# ------------------------------***Excel互動函數***------------------------------
##
## 參數:
## ---- wb_path:Excel: 存放目錄
## ---- wb_name:Excel: 名稱
## ---- create_sheet: 是否建立sheet
## ---- sheet_name: 建立sheet名稱or加載sheet名稱
## ---- pic_name: 需要儲存的圖檔名稱,預設png格式。此處無需字尾&路徑
## ---- pic_path: 圖檔的存放目錄
## ---- data: 需要儲存的dataframe資料
#### 測試方式
####--1.載入已有資料,建立sheet,并且儲存圖檔和資料
# save2excel(wb_path = "./Docment/",
# wb_name = "Statistics",
# create_sheet = TRUE,
# sheet_name = "test6",
# pic_path = "./Picture/",
# pic_name = "Rplot",data = data,
# save_pic = TRUE,save_data = TRUE)
save2excel <- function(wb_path = "./Document/",
wb_name = "Statistics",
create_sheet = TRUE,
sheet_name = "Sheet",
pic_path = "./Picture/",
pic_name = "test",data,
save_pic = TRUE,save_data = TRUE) {
package_need <- c('tidyverse', 'rJava', 'xlsxjars', 'xlsx')
package_install <- sapply(X=package_need, FUN = require, character.only = TRUE)
if(!all(package_install)){
## 有package沒有安裝
print(paste0("U Must install ",package_need[which(!package_install)]))
}
wb_complete_path <- paste0(wb_path,wb_name,".xlsx")
pic_complete_path <- paste0(pic_path,pic_name,".png")
# 先加載Excel
wb <- loadWorkbook(wb_complete_path)
if(create_sheet){
# 建立Sheet
usesheet <- createSheet(wb, sheetName=sheet_name)
}else{
# 加載現有Sheet
usesheet <- eval(parse(text = paste0("getSheets(wb)$",sheet_name)))
}
if(save_pic){
# 添加圖檔
addPicture(
file = pic_complete_path,
sheet = usesheet, # 圖檔儲存的sheet
startRow = 15,# 在sheet中的位置
startColumn = 2
)
}
if(save_data){
## 設定樣式
#### cs1 标題行
cs1 <-
CellStyle(wb) +
Alignment(horizontal = "ALIGN_CENTER",
vertical = "VERTICAL_CENTER") + # 對齊方式,水準&豎直居中
Border(
color = "black",
position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"),
pen = c("BORDER_THIN",
"BORDER_THIN",
"BORDER_THIN",
"BORDER_THIN")) + # 所有邊框加框線&黑色框線
Font(wb, isItalic = TRUE, isBold = TRUE)+ # 字型加粗、Italic字型
Fill(
foregroundColor = "cornflowerblue",
backgroundColor = "cornflowerblue",
pattern = "SOLID_FOREGROUND"
) #背景色為矢車菊藍
#### 第一列樣式
cs2 <-
CellStyle(wb) + Border(
color = "black",
position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"),
pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN")
)+#黑色細框線
Font(wb, isItalic = FALSE, isBold = TRUE)+
Fill(
foregroundColor = "lightblue",
backgroundColor = "lightblue",
pattern = "SOLID_FOREGROUND"
)# 淺藍色填充
#### 非首列樣式
cs3 <-
CellStyle(wb) + Border(
color = "black",
position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"),
pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN")
)#黑色細框線
list_name <- paste0("`", 2:ncol(data),"`")
colStyle_str <- paste("list(`1`=cs2,", paste0(list_name,"=","cs3",collapse = ","),")")
colStyle <- eval(parse(text = colStyle_str))
addDataFrame(
x = data,# 要儲存的資料框
sheet = usesheet,# 儲存的位置
col.names = TRUE,# 是否保留列名
row.names = FALSE,# 是否保留行名
startRow = 1,#資料儲存的起始位置
startColumn = 1,#資料儲存的起始位置
colStyle= colStyle,# 第二、三列藍色
colnamesStyle = cs1,# 列名的格式
rownamesStyle = cs1,# 行名的格式
showNA = FALSE,# 空值是否展示,預設不展示,即保留為空白格
characterNA = "",# NA展示位空字元串和上面showNA參數有關系
byrow = FALSE
)
}
saveWorkbook(wb, wb_complete_path)
}
建立Excel
# ------------------------------***建立Excel函數***------------------------------
##
## 參數:
## ---- create_wb 是否建立Excel
## ---- create_sheet 是否建立新Sheet
## ---- wb_path Excel存放目錄
## ---- wb_name Excel名稱
## ---- sheet_name Sheet名稱
create_excel_f <-
function(create_wb = FALSE,
create_sheet = TRUE,
wb_path = "./Docment/",
wb_name = "Statistics",
sheet_name = "Sheet") {
package_need <- c('tidyverse', 'rJava', 'xlsxjars', 'xlsx')
package_install <- sapply(X=package_need, FUN = require, character.only = TRUE)
if(!all(package_install)){
## 有package沒有安裝
print(paste0("U Must install ",package_need[which(!package_install)]))
}
if(create_wb){
##建立Excel
save_path <- paste0(wb_path,wb_name,".xlsx")
# 建立Excel和Sheet
wb <- createWorkbook()
if(create_sheet){
sheet1 <- createSheet(wb, sheet_name)
}
#儲存Excel
saveWorkbook(wb, save_path)
}
}