##Load library and record data
library(stats) ## for dist
#https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist
## There are many clustering libraries
#install.packages("NbClust")
library(NbClust)
library(cluster)
library(mclust)
## Package 'mclust' version 5.4.6
## Type 'citation("mclust")' for citing this R package in publications.
library(amap) ## for Kmeans (notice the cap K)
library(factoextra) ## for cluster vis, silhouette, etc.
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:mclust':
##
## map
#install.packages("stylo")
library(stylo) ## for dist.cosine
##
## ### stylo version: 0.7.3 ###
##
## If you plan to cite this software (please do!), use the following reference:
## Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
## a package for computational text analysis. R Journal 8(1): 107-121.
## <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
##
## To get full BibTeX entry, type: citation("stylo")
#install.packages("philentropy")
library(philentropy) ## for distance() which offers 46 metrics
## https://cran.r-project.org/web/packages/philentropy/vignettes/Distances.html
library(SnowballC)
library(caTools)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
library(stringr)
library(wordcloud)
## Loading required package: RColorBrewer
library(tm) ## to read in corpus (text data)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## Attaching package: 'tm'
## The following object is masked from 'package:koRpus':
##
## readTagged
#Record data
#Data preparation
#Setting working directory
setwd("C:/Users/Haoming Feng/Desktop/501/project")
#Load in data
Record_all<-read.csv("df1.csv")
Record <- Record_all
#Check my data
head(Record)
## X X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25
## 1 li -3.073415 -0.4242406 1.0869584 -6.388058 -2.6451603 2.931595
## 2 doyu -1.475106 -2.1221860 -1.7173066 -3.020133 0.2947769 -4.902622
## 3 nio -1.572005 -1.2598414 4.0584417 -6.555673 3.0600502 1.664813
## 4 huya -2.468234 -1.7391341 0.3065131 -3.825555 1.5378433 -1.223843
## 5 bili -3.125701 3.4077323 -0.0692173 -2.719670 1.1103043 1.296210
## 6 iq 1.620678 3.4771168 2.1221298 0.254235 1.5791674 -1.313557
## X2020.9.28 X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 -0.6794355 5.9079650 -1.3053430 -5.4054109 4.1230774 0.5740550 -2.0880302
## 2 -3.6576932 -2.6772785 -2.6529084 3.6676702 5.1879667 -6.0976466 -1.4168500
## 3 -0.3707611 11.1407256 -2.3020259 0.3690033 1.6802707 -0.3231750 -4.7904148
## 4 -3.3515223 -4.7427590 0.7148868 4.2153600 4.7990161 -4.8437490 -1.3765188
## 5 -1.9005105 -2.7737921 2.0107889 0.3778037 1.4563069 6.1176435 4.1513474
## 6 -3.0485228 0.9296108 -0.3530447 1.1343814 0.6607912 -0.6369411 0.7203393
str(Record)
## 'data.frame': 6 obs. of 14 variables:
## $ X : Factor w/ 6 levels "bili","doyu",..: 5 2 6 3 1 4
## $ X2020.9.18: num -3.07 -1.48 -1.57 -2.47 -3.13 ...
## $ X2020.9.21: num -0.424 -2.122 -1.26 -1.739 3.408 ...
## $ X2020.9.22: num 1.087 -1.7173 4.0584 0.3065 -0.0692 ...
## $ X2020.9.23: num -6.39 -3.02 -6.56 -3.83 -2.72 ...
## $ X2020.9.24: num -2.645 0.295 3.06 1.538 1.11 ...
## $ X2020.9.25: num 2.93 -4.9 1.66 -1.22 1.3 ...
## $ X2020.9.28: num -0.679 -3.658 -0.371 -3.352 -1.901 ...
## $ X2020.9.29: num 5.91 -2.68 11.14 -4.74 -2.77 ...
## $ X2020.9.30: num -1.305 -2.653 -2.302 0.715 2.011 ...
## $ X2020.10.1: num -5.405 3.668 0.369 4.215 0.378 ...
## $ X2020.10.2: num 4.12 5.19 1.68 4.8 1.46 ...
## $ X2020.10.5: num 0.574 -6.098 -0.323 -4.844 6.118 ...
## $ X2020.10.6: num -2.09 -1.42 -4.79 -1.38 4.15 ...
#Save the label
label <- Record$X
#Remove the label
Record <- Record[,-c(1)]
head(Record)
## X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25 X2020.9.28
## 1 -3.073415 -0.4242406 1.0869584 -6.388058 -2.6451603 2.931595 -0.6794355
## 2 -1.475106 -2.1221860 -1.7173066 -3.020133 0.2947769 -4.902622 -3.6576932
## 3 -1.572005 -1.2598414 4.0584417 -6.555673 3.0600502 1.664813 -0.3707611
## 4 -2.468234 -1.7391341 0.3065131 -3.825555 1.5378433 -1.223843 -3.3515223
## 5 -3.125701 3.4077323 -0.0692173 -2.719670 1.1103043 1.296210 -1.9005105
## 6 1.620678 3.4771168 2.1221298 0.254235 1.5791674 -1.313557 -3.0485228
## X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 5.9079650 -1.3053430 -5.4054109 4.1230774 0.5740550 -2.0880302
## 2 -2.6772785 -2.6529084 3.6676702 5.1879667 -6.0976466 -1.4168500
## 3 11.1407256 -2.3020259 0.3690033 1.6802707 -0.3231750 -4.7904148
## 4 -4.7427590 0.7148868 4.2153600 4.7990161 -4.8437490 -1.3765188
## 5 -2.7737921 2.0107889 0.3778037 1.4563069 6.1176435 4.1513474
## 6 0.9296108 -0.3530447 1.1343814 0.6607912 -0.6369411 0.7203393
##Check the distance
(Dist1<- dist(Record, method = "minkowski", p=1)) ##Manhattan
## 1 2 3 4 5
## 2 50.63452
## 3 30.80305 53.01346
## 4 45.37053 17.09758 49.04916
## 5 47.55159 47.46583 51.87571 38.61895
## 6 47.06358 41.85663 43.58235 35.89778 32.01258
(Dist2<- dist(Record, method = "minkowski", p=2)) #Euclidean
## 1 2 3 4 5
## 2 17.534875
## 3 11.038747 19.161554
## 4 17.075426 6.219593 18.976512
## 5 15.631304 17.464686 19.898087 14.801747
## 6 14.619378 12.684626 15.528929 12.082997 11.064903
(DistE<- dist(Record, method = "euclidean")) #same as p = 2
## 1 2 3 4 5
## 2 17.534875
## 3 11.038747 19.161554
## 4 17.075426 6.219593 18.976512
## 5 15.631304 17.464686 19.898087 14.801747
## 6 14.619378 12.684626 15.528929 12.082997 11.064903
df <- scale(Record)
#Silhouette method
fviz_nbclust(df, kmeans, method = "silhouette",k.max = 5)+
labs(subtitle = "Silhouette method")
#Elbow method to determine the optimal number of cluster
fviz_nbclust(df, kmeans, method = "wss",k.max = 5) +
geom_vline(xintercept = 3, linetype = 2)+
labs(subtitle = "Elbow method")
#Gap method to determine the optimal number of cluster
set.seed(123)
fviz_nbclust(df, kmeans, nstart = 25, method = "gap_stat", nboot = 50,k.max = 5)+
labs(subtitle = "Gap statistic method")
#As a result, the optimal number of cluster is 3
(Record_Norm <- as.data.frame(apply(Record[,1:13],2,function(x)(x-min(x))/(max(x)-min(x)))))
## X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25 X2020.9.28
## 1 0.01101603 0.30324229 0.4855241 0.02461336 0.0000000 1.0000000 0.90609043
## 2 0.34775878 0.00000000 0.0000000 0.51917585 0.5153074 0.0000000 0.00000000
## 3 0.32734336 0.15400928 1.0000000 0.00000000 1.0000000 0.8383014 1.00000000
## 4 0.13851978 0.06841063 0.3503996 0.40090379 0.7331901 0.4695784 0.09314793
## 5 0.00000000 0.98760836 0.2853465 0.56329733 0.6582517 0.7912510 0.53459659
## 6 1.00000000 1.00000000 0.6647513 1.00000000 0.7404333 0.4581269 0.18533099
## X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 0.6705534 0.28894786 0.0000000 0.7647784 0.5461763 0.3022206
## 2 0.1300395 0.00000000 0.9430721 1.0000000 0.0000000 0.3772819
## 3 1.0000000 0.07523698 0.6002029 0.2251911 0.4727249 0.0000000
## 4 0.0000000 0.72212987 1.0000000 0.9140854 0.1026498 0.3817923
## 5 0.1239632 1.00000000 0.6011176 0.1757201 1.0000000 1.0000000
## 6 0.3571238 0.49314172 0.6797576 0.0000000 0.4470385 0.6162940
(Dist_norm<- dist(Record_Norm, method = "minkowski", p=2))
## 1 2 3 4 5
## 2 2.096788
## 3 1.524771 2.253341
## 4 1.890417 1.010665 1.997703
## 5 1.902406 2.301340 2.206530 1.784088
## 6 2.259156 2.003615 2.044897 1.835237 1.541379
kmeans_Result <- kmeans(Record, 3, nstart=25)
print(kmeans_Result)
## K-means clustering with 3 clusters of sizes 2, 2, 2
##
## Cluster means:
## X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25
## 1 -2.3227102 -0.842041 2.5727000 -6.471865 0.2074450 2.298203685
## 2 -0.7525115 3.442425 1.0264562 -1.232717 1.3447359 -0.008673721
## 3 -1.9716700 -1.930660 -0.7053968 -3.422844 0.9163101 -3.063232782
## X2020.9.28 X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 -0.5250983 8.5243453 -1.8036844 -2.5182038 2.901674 0.125440 -3.439222
## 2 -2.4745167 -0.9220906 0.8288721 0.7560926 1.058549 2.740351 2.435843
## 3 -3.5046077 -3.7100188 -0.9690108 3.9415151 4.993491 -5.470698 -1.396684
##
## Clustering vector:
## [1] 1 3 1 3 2 2
##
## Within cluster sum of squares by cluster:
## [1] 60.92697 61.21604 19.34167
## (between_SS / total_SS = 76.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans_Result$centers
## X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25
## 1 -2.3227102 -0.842041 2.5727000 -6.471865 0.2074450 2.298203685
## 2 -0.7525115 3.442425 1.0264562 -1.232717 1.3447359 -0.008673721
## 3 -1.9716700 -1.930660 -0.7053968 -3.422844 0.9163101 -3.063232782
## X2020.9.28 X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 -0.5250983 8.5243453 -1.8036844 -2.5182038 2.901674 0.125440 -3.439222
## 2 -2.4745167 -0.9220906 0.8288721 0.7560926 1.058549 2.740351 2.435843
## 3 -3.5046077 -3.7100188 -0.9690108 3.9415151 4.993491 -5.470698 -1.396684
#Show the result
kmeans_Result$cluster
## [1] 1 3 1 3 2 2
#Visualize
fviz_cluster(kmeans_Result, Record, main="Euclidean")
#Hierarchical Clustering
(HClust_Ward_Euc_DF <- hclust(Dist2, method = "average" ))
##
## Call:
## hclust(d = Dist2, method = "average")
##
## Cluster method : average
## Distance : minkowski
## Number of objects: 6
plot(HClust_Ward_Euc_DF, cex=0.9, hang=-1, main = "Minkowski p=2 (Euclidean)")
rect.hclust(HClust_Ward_Euc_DF, k=3)
#heatmap
fviz_dist(Dist2, gradient = list(low = "#00AFBB",
mid = "white", high = "#FC4E07"))+
ggtitle("Euclidean Heatmap")
## Text Data
corpus <- Corpus(DirSource(directory = "C:/Users/Haoming Feng/Desktop/501/project/corpus"))
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
ndocs <- length(corpus)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
## Remove all Stop Words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus_dtm <- DocumentTermMatrix(corpus,control = list(stopwords = TRUE,
wordLengths = c(3,10),
removePunctuation = TRUE,
removeNumbers= TRUE,
tolower = TRUE))
inspect(corpus_dtm)
## <<DocumentTermMatrix (documents: 9, terms: 55)>>
## Non-/sparse entries: 71/424
## Sparsity : 86%
## Maximal term length: 10
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aims battery build extesla recycler stock straubel tesla top
## newseight.txt 0 0 0 0 0 0 0 0 0
## newsfive.txt 1 1 1 1 1 0 1 0 1
## newsfour.txt 1 1 1 1 1 0 1 0 1
## newsnine.txt 0 0 0 0 0 2 0 1 0
## newsone.txt 0 0 0 0 0 0 0 1 0
## newsseven.txt 0 0 0 0 0 1 0 1 0
## newssix.txt 0 0 0 0 0 0 0 1 0
## newsthree.txt 0 0 0 0 0 1 0 1 0
## newstwo.txt 0 0 0 0 0 0 0 0 0
## Terms
## Docs worlds
## newseight.txt 0
## newsfive.txt 1
## newsfour.txt 1
## newsnine.txt 0
## newsone.txt 0
## newsseven.txt 0
## newssix.txt 0
## newsthree.txt 0
## newstwo.txt 0
corpus_mx <- (as.matrix(corpus_dtm))
word.freq <- sort(colSums(corpus_mx),decreasing = T)
#Wordcloud here
wordcloud(words = names(word.freq),freq = word.freq*2,min.freq = 2)