接着重复这篇文章
Data Visualization and Analysis of Taylor Swift’s Song Lyrics
情感分析
情感分析是啥暂时不太关注,主要关注文章里的数据可视化部分,按照文章中的代码准备数据
# Load the lyrics table. stringsAsFactors = FALSE keeps the text columns as
# character vectors on every R version (before R 4.0 read.csv() turned
# strings into factors by default).
lyrics <- read.csv("taylor_swift_lyrics_1.csv", header = TRUE,
                   stringsAsFactors = FALSE)
lyrics_text <- lyrics$lyric
# Strip every run of punctuation characters.
lyrics_text <- gsub('[[:punct:]]+', '', lyrics_text)
# NOTE(review): inside an R string "\1" is an octal escape (the control
# character \001), not the regex backreference "\\1". As written, the pattern
# only matches a letter followed by \001 characters, so this call is
# effectively a no-op on ordinary text. It is left unchanged on purpose:
# switching to "\\1" would delete doubled letters from real words
# ("hello" -> "heo"). Confirm the intended cleanup before changing it.
lyrics_text <- gsub("([[:alpha:]])\1+", "", lyrics_text)
library(syuzhet)
# Browsing the package index only makes sense in an interactive session.
if (interactive()) {
  help(package = "syuzhet")
}
# Score the cleaned lyrics against the NRC lexicon; the result is a data
# frame with one column per emotion plus "negative" and "positive".
ty_sentiment <- get_nrc_sentiment(lyrics_text)
复制
gsub
函数是用来替换字符的,基本的用法是
> gsub("A","a","AAAbbbccc")
[1] "aaabbbccc"
复制
第一个位置是要替换的字符,第二个位置是替换成啥,第三个位置是完整的字符串。
第一个位置应该是可以用正则表达式的,但是R语言的正则表达式自己还没有掌握
所以下面两行代码
lyrics_text<- gsub('[[:punct:]]+', '', lyrics_text)
lyrics_text<- gsub("([[:alpha:]])\1+", "", lyrics_text)
复制
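干了啥自己还没看明白(简单说:第一行是把所有标点符号删掉;第二行看起来是想用反向引用去匹配连续重复的字母,但R的字符串里 "\1" 会被解析成八进制转义字符,要写成 "\\1" 才是反向引用,所以这一行实际上基本不起作用),原文的英文注释是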
removing punctations and alphanumeric content
get_nrc_sentiment()
应该是做情感分析的函数吧?
暂时先不管他了
将
ty_sentiment <- get_nrc_sentiment((lyrics_text))
运行完得到了一个数据框
> head(ty_sentiment)
anger anticipation disgust fear joy sadness surprise trust
1 0 0 0 0 0 1 0 0
2 0 0 1 1 0 1 0 0
3 1 0 1 0 0 1 0 0
4 0 0 1 0 0 0 0 1
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
negative positive
1 0 0
2 1 0
3 1 0
4 1 0
5 0 0
6 0 0
复制
构造接下来作图用到的数据集
# Sum every sentiment column of ty_sentiment; the column names become the
# row names of a one-column data frame.
sentimentscores <- data.frame(colSums(ty_sentiment[, ]))
head(sentimentscores)
colnames(sentimentscores) <- "Score"
# Promote the row names to a regular "sentiment" column and fall back to
# automatic row numbering in the same step.
sentimentscores <- data.frame(
  sentiment = row.names(sentimentscores),
  sentimentscores,
  row.names = NULL,
  check.names = FALSE
)
复制
最终的数据集长这个样子
> sentimentscores
sentiment Score
1 anger 538
2 anticipation 760
3 disgust 427
4 fear 637
5 joy 956
6 sadness 684
7 surprise 429
8 trust 665
9 negative 1086
10 positive 1335
复制
ggplot2基本的柱形图
library(ggplot2)
# Bar chart of the total score per sentiment, one colour per bar.
# theme_minimal() is a complete theme and replaces every theme setting made
# before it, so it has to come first; otherwise the legend that
# theme(legend.position = "none") is meant to hide is drawn again.
ggplot(data = sentimentscores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Total sentiment based on scores")
复制

image.png
# The pipe, count(), left_join(), rename() and inner_join() come from dplyr;
# unnest_tokens() and get_sentiments() come from tidytext. Attach both here
# so this section runs without relying on an earlier session.
library(dplyr)
library(tidytext)

lyrics$lyric <- as.character(lyrics$lyric)
# One row per word: split every lyric line into lower-case word tokens.
tidy_lyrics <- lyrics %>%
  unnest_tokens(word, lyric)
# Number of words in each song.
song_wrd_count <- tidy_lyrics %>%
  count(track_title)
# Attach each song's word total to every one of its word rows.
lyric_counts <- tidy_lyrics %>%
  left_join(song_wrd_count, by = "track_title") %>%
  rename(total_words = n)
# Keep only the words found in the NRC lexicon, tagged with their sentiment.
# NOTE(review): recent tidytext versions appear to fetch the NRC lexicon
# through the textdata package and ask for a licence confirmation on first
# use -- presumably the source of the error mentioned below; confirm.
lyric_sentiment <- tidy_lyrics %>%
  inner_join(get_sentiments("nrc"), by = "word")
复制
重复到这的时候遇到了报错
所以下面自己构造数据集,可视化的结果就偏离真实情况了
# Placeholder data: assign every word a sentiment label drawn at random from
# the ty_sentiment column names (this includes "negative" and "positive").
# The draw is not seeded, so the figures differ between runs; call
# set.seed() beforehand if reproducible plots are needed.
lyric_counts$sentiment <- sample(
  colnames(ty_sentiment),
  nrow(lyric_counts),
  replace = TRUE
)
# The most frequent words within each sentiment. wt = n names the ranking
# column explicitly instead of letting top_n() pick the last column; ties
# are kept, so a group can return more than 10 rows.
df <- lyric_counts %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(n = 10, wt = n) %>%
  ungroup()
head(df)
head(lyric_counts)
复制
作图
library(ggplot2)
# Horizontal bar chart of the top words, one facet per sentiment.
p_top_words <- ggplot(df, aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Top words used to express emotions and sentiments")
# Print explicitly: a ggplot object is only drawn when printed, and
# auto-printing does not happen when the script is run through source(),
# which would leave an empty PNG file.
png("1.png", height = 800, width = 1000)
print(p_top_words)
dev.off()
复制
image.png
第二幅图
# The songs with the most words in each sentiment. wt = n names the ranking
# column explicitly; ungroup() keeps the result from staying grouped, in
# line with how df is built.
df1 <- lyric_counts %>%
  count(track_title, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(n = 5, wt = n) %>%
  ungroup()
# Horizontal bar chart of the top songs, one facet per sentiment.
p_top_songs <- ggplot(df1, aes(x = reorder(track_title, n), y = n,
                               fill = sentiment)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Top songs associated with emotions and sentiments") +
  coord_flip() +
  theme_bw()
# Print explicitly so the PNG is also filled when the script is run through
# source(), where auto-printing is disabled.
png(file = "2.png", width = 1300, height = 700)
print(p_top_songs)
dev.off()
复制
image.png
接下来是最想重复的一幅图
# circos.par(), chordDiagram() and circos.clear() come from circlize.
library(circlize)

# Number of words per (year, sentiment) pair. count() already returns one
# row per pair, so no extra group_by()/summarise() step is needed.
year_emotion <- lyric_counts %>%
  count(year, sentiment, name = "sentiment_sum")
head(year_emotion)
# Drop the rows that have a missing value (e.g. an unknown year).
year_emotion <- na.omit(year_emotion)
# One colour per release year; every sentiment sector is grey. "negative"
# and "positive" are listed as well because the sampled sentiment column is
# drawn from all ty_sentiment column names.
grid.col <- c("2006" = "#E69F00", "2008" = "#56B4E9",
              "2010" = "#009E73", "2012" = "#CC79A7",
              "2014" = "#D55E00", "2017" = "#00D6C9",
              "anger" = "grey", "anticipation" = "grey",
              "disgust" = "grey", "fear" = "grey",
              "joy" = "grey", "sadness" = "grey",
              "surprise" = "grey", "trust" = "grey",
              "negative" = "grey", "positive" = "grey")
# Gap of 6 degrees between neighbouring sectors, and a wider gap of 15
# after the last year and after the last sentiment to separate the groups.
n_years <- length(unique(year_emotion$year))
n_sentiments <- length(unique(year_emotion$sentiment))
circos.par(gap.after = c(rep(6, n_years - 1), 15,
                         rep(6, n_sentiments - 1), 15))
chordDiagram(year_emotion, grid.col = grid.col, transparency = .2)
title("Relationship between emotion and song's year of release")
# Reset the circos.par() settings so that later plots start clean.
circos.clear()
复制
image.png
重复过程中遇到很多dplyr包中的函数,都是第一次使用,抽时间再回过头来看这些函数的用法!