数据可视化有意思的小例子：Taylor Swift 歌词数据分析和可视化——第二部分

接着重复这篇文章

Data Visualization and Analysis of Taylor Swift’s Song Lyrics

情感分析

情感分析是啥暂时不太关注，主要关注文章里的数据可视化部分，按照文章中的代码准备数据

library(syuzhet)
# Opens the package help index; handy when exploring syuzhet interactively.
help(package = "syuzhet")

# Read the lyrics table; the `lyric` column is the text that gets scored.
lyrics <- read.csv("taylor_swift_lyrics_1.csv", header = TRUE)
lyrics_text <- lyrics$lyric

# Strip every run of punctuation characters.
lyrics_text <- gsub("[[:punct:]]+", "", lyrics_text)

# NOTE(review): inside an R string literal "\1" is an octal escape (the
# control character with code 1), not the regex backreference "\\1". The
# pattern therefore only matches a letter followed by such control characters
# and is effectively a no-op on ordinary text. It is kept unchanged so the
# scores stay the same; switching to "\\1" would delete doubled letters
# (e.g. "good" -> "gd") and is probably not what is wanted either.
lyrics_text <- gsub("([[:alpha:]])\1+", "", lyrics_text)

# Score each element of lyrics_text with syuzhet's NRC scorer; the result is
# a data frame with one column per emotion plus negative/positive.
ty_sentiment <- get_nrc_sentiment(lyrics_text)

复制

gsub

函数是用来替换字符的，基本的用法是

> gsub("A","a","AAAbbbccc")
[1] "aaabbbccc"

复制

第一个位置是要替换的字符，第二个位置是替换成啥，第三个位置是完整的字符串。

第一个位置（pattern 参数）默认就是按正则表达式来解析的；如果只想按字面字符匹配，可以加上 fixed = TRUE。不过R语言的正则表达式自己还没有掌握

所以下面两行代码

# Strip every run of punctuation characters from the lyrics.
lyrics_text<- gsub('[[:punct:]]+', '', lyrics_text)
# NOTE(review): inside an R string "\1" is an octal escape (the control
# character with code 1), not the regex backreference "\\1", so this pattern
# only matches a letter followed by such control characters.
lyrics_text<- gsub("([[:alpha:]])\1+", "", lyrics_text)

复制

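干了啥自己还没看明白，原文的英文注释是

removing punctuations and alphanumeric content

补充说明：第一行是把所有标点符号（[[:punct:]]）删掉。第二行的本意大概是用反向引用匹配连续重复的字母，但在 R 的字符串里 "\1" 会被解析成八进制转义字符（编码为 1 的控制字符），而不是正则表达式里的反向引用（反向引用要写成 "\\1"），所以这一行对普通文本基本上不起作用。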

get_nrc_sentiment()

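这是 syuzhet 包里做情感分析的函数：它根据 NRC 情感词典，统计每条文本中属于 8 种情绪（anger、anticipation、disgust、fear、joy、sadness、surprise、trust）以及 negative、positive 两种极性的词的个数。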

暂时先不管他了

将

ty_sentiment <- get_nrc_sentiment((lyrics_text))

运行完得到了一个数据框

> head(ty_sentiment)
  anger anticipation disgust fear joy sadness surprise trust
1     0            0       0    0   0       1        0     0
2     0            0       1    1   0       1        0     0
3     1            0       1    0   0       1        0     0
4     0            0       1    0   0       0        0     1
5     0            0       0    0   0       0        0     0
6     0            0       0    0   0       0        0     0
  negative positive
1        0        0
2        1        0
3        1        0
4        1        0
5        0        0
6        0        0

复制

构造接下来作图用到的数据集

# Total every sentiment column; the column names of ty_sentiment end up as
# the row names of this one-column data frame.
sentimentscores <- data.frame(colSums(ty_sentiment[, ]))
head(sentimentscores)

# Name the totals column, then promote the row names to an ordinary
# `sentiment` column so it can be mapped to a plot aesthetic.
colnames(sentimentscores) <- "Score"
sentimentscores <- cbind(
  "sentiment" = rownames(sentimentscores),
  sentimentscores
)
rownames(sentimentscores) <- NULL

复制

最终的数据集长这个样子

> sentimentscores
      sentiment Score
1         anger   538
2  anticipation   760
3       disgust   427
4          fear   637
5           joy   956
6       sadness   684
7      surprise   429
8         trust   665
9      negative  1086
10     positive  1335

复制

ggplot2基本的柱形图

library(ggplot2)

# Bar chart of the total score per sentiment, one coloured bar each.
# theme_minimal() is a complete theme that resets every theme element, so it
# has to come BEFORE theme(legend.position = "none"); in the reverse order the
# legend is switched back on.
ggplot(data = sentimentscores, aes(x = sentiment, y = Score)) +
  geom_col(aes(fill = sentiment)) +
  xlab("Sentiments") +
  ylab("Scores") +
  ggtitle("Total sentiment based on scores") +
  theme_minimal() +
  theme(legend.position = "none")

复制

数据可视化有意思的小例子：Taylor Swift 歌词数据分析和可视化——第二部分

image.png

# `%>%`, count(), left_join(), rename() and inner_join() come from dplyr;
# unnest_tokens() and get_sentiments() come from tidytext. Attach both so the
# snippet also runs in a fresh session.
library(dplyr)
library(tidytext)

# Make sure the lyric column is plain character text before tokenising.
lyrics$lyric <- as.character(lyrics$lyric)

# Split the `lyric` column into tokens, stored in a new `word` column.
tidy_lyrics <- lyrics %>%
  unnest_tokens(word, lyric)

# Number of tokens per song.
song_wrd_count <- tidy_lyrics %>%
  count(track_title)

# Attach the per-song total to every token row.
lyric_counts <- tidy_lyrics %>%
  left_join(song_wrd_count, by = "track_title") %>%
  rename(total_words = n)

# Keep only the words that appear in the NRC lexicon.
# NOTE(review): recent tidytext versions obtain the "nrc" lexicon through the
# textdata package, which asks for an interactive download the first time.
# This may be the cause of the error mentioned after this snippet; confirm.
lyric_sentiment <- tidy_lyrics %>%
  inner_join(get_sentiments("nrc"), by = "word")

复制

重复到这的时候遇到了报错

所以下面自己构造数据集：给每个单词随机分配一个情感标签。因此后面的可视化结果只用来演示作图方法，并不反映真实的情感分布。

# Placeholder data: give every token row a sentiment label drawn at random
# (with replacement) from the column names of ty_sentiment. The labels carry
# no real meaning; they only make the plotting code below runnable.
lyric_counts$sentiment <- sample(
  colnames(ty_sentiment),
  nrow(lyric_counts),
  replace = TRUE
)

# Count each (word, sentiment) pair and keep the 10 most frequent words per
# sentiment. `wt = n` is spelled out so top_n() does not have to guess the
# ranking column; ties are kept, so a group can hold more than 10 rows.
df <- lyric_counts %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(n = 10, wt = n) %>%
  ungroup()
head(df)
head(lyric_counts)

复制

作图

library(ggplot2)

# Horizontal bars of the most frequent words, one facet per sentiment.
# The plot is built before the device is opened, so an error while building
# it cannot leave a half-open png device behind.
top_words_plot <- ggplot(
  df,
  aes(x = reorder(word, n), y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Sentiments") +
  ylab("Scores") +
  ggtitle("Top words used to express emotions and sentiments")

# print() is explicit because ggplot objects are only drawn when printed;
# auto-printing does not happen inside source() or a function.
png("1.png", height = 800, width = 1000)
print(top_words_plot)
dev.off()

复制

image.png

第二幅图

# Count each (song, sentiment) pair and keep the 5 songs with the highest
# count per sentiment (ties are kept). The result is ungrouped so later code
# does not inherit the grouping by accident.
df1 <- lyric_counts %>%
  count(track_title, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(n = 5, wt = n) %>%
  ungroup()

# Horizontal bars of the top songs, one facet per sentiment. The plot is
# built before the device is opened, so a failure here leaves no open device.
top_songs_plot <- ggplot(
  df1,
  aes(x = reorder(track_title, n), y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  xlab("Sentiments") +
  ylab("Scores") +
  ggtitle("Top songs associated with emotions and sentiments") +
  coord_flip() +
  theme_bw()

# print() is explicit because ggplot objects are only drawn when printed.
png(file = "2.png", width = 1300, height = 700)
print(top_songs_plot)
dev.off()

复制

image.png

接下来是最想重复的一幅图

# circos.par(), chordDiagram() and circos.clear() come from circlize.
library(circlize)

# Number of token rows per (year, sentiment) pair. count() already returns
# one row per pair, so no extra group_by()/summarise() step is needed. The
# column order year, sentiment, value is what chordDiagram() reads as
# from / to / link width.
year_emotion <- lyric_counts %>%
  count(year, sentiment, name = "sentiment_sum")
head(year_emotion)

# Drop the rows with a missing value (e.g. an unknown year).
year_emotion <- na.omit(year_emotion)

# Sector colours: one distinct colour per year, grey for every sentiment.
# "negative" and "positive" are listed too, because the labels sampled from
# colnames(ty_sentiment) include them.
# NOTE(review): recent circlize versions seem to colour unnamed sectors grey
# anyway; listing them explicitly is meant to keep older versions happy too.
grid.col <- c(
  "2006" = "#E69F00", "2008" = "#56B4E9",
  "2010" = "#009E73", "2012" = "#CC79A7",
  "2014" = "#D55E00", "2017" = "#00D6C9",
  "anger" = "grey", "anticipation" = "grey",
  "disgust" = "grey", "fear" = "grey",
  "joy" = "grey", "sadness" = "grey",
  "surprise" = "grey", "trust" = "grey",
  "negative" = "grey", "positive" = "grey"
)

# A 6-degree gap between neighbouring sectors and a 15-degree gap after the
# last year sector and after the last sentiment sector.
n_years <- length(unique(year_emotion$year))
n_sentiments <- length(unique(year_emotion$sentiment))
circos.par(gap.after = c(
  rep(6, n_years - 1), 15,
  rep(6, n_sentiments - 1), 15
))

chordDiagram(year_emotion, grid.col = grid.col, transparency = .2)
title("Relationship between emotion and song's year of release")
# Reset the circos parameters so the next circular plot starts clean.
circos.clear()

复制

image.png

重复过程中遇到很多dplyr包中的函数，都是第一次使用，抽时间再回过头来看这些函数的用法！