module 3

##Load library and record data

library(stats)  ## for dist
#https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist

## There are many clustering libraries
#install.packages("NbClust")
library(NbClust)
library(cluster)
library(mclust)

## Package 'mclust' version 5.4.6
## Type 'citation("mclust")' for citing this R package in publications.

library(amap)  ## for Kmeans (notice the cap K)

library(factoextra) ## for cluster vis, silhouette, etc.

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:mclust':
## 
##     map

#install.packages("stylo")
library(stylo)  ## for dist.cosine

## 
## ### stylo version: 0.7.3 ###
## 
## If you plan to cite this software (please do!), use the following reference:
##     Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
##     a package for computational text analysis. R Journal 8(1): 107-121.
##     <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
## 
## To get full BibTeX entry, type: citation("stylo")

#install.packages("philentropy")
library(philentropy)  ## for distance() which offers 46 metrics
## https://cran.r-project.org/web/packages/philentropy/vignettes/Distances.html
library(SnowballC)
library(caTools)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(textstem)

## Loading required package: koRpus.lang.en

## Loading required package: koRpus

## Loading required package: sylly

## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()

library(stringr)
library(wordcloud)

## Loading required package: RColorBrewer

library(tm) ## to read in corpus (text data)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## 
## Attaching package: 'tm'

## The following object is masked from 'package:koRpus':
## 
##     readTagged

#Record data

#Data preparation
#Setting working directory
setwd("C:/Users/Haoming Feng/Desktop/501/project")
#Load in data
Record_all<-read.csv("df1.csv")
Record <- Record_all
#Check my data
head(Record)

##      X X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25
## 1   li  -3.073415 -0.4242406  1.0869584  -6.388058 -2.6451603   2.931595
## 2 doyu  -1.475106 -2.1221860 -1.7173066  -3.020133  0.2947769  -4.902622
## 3  nio  -1.572005 -1.2598414  4.0584417  -6.555673  3.0600502   1.664813
## 4 huya  -2.468234 -1.7391341  0.3065131  -3.825555  1.5378433  -1.223843
## 5 bili  -3.125701  3.4077323 -0.0692173  -2.719670  1.1103043   1.296210
## 6   iq   1.620678  3.4771168  2.1221298   0.254235  1.5791674  -1.313557
##   X2020.9.28 X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 -0.6794355  5.9079650 -1.3053430 -5.4054109  4.1230774  0.5740550 -2.0880302
## 2 -3.6576932 -2.6772785 -2.6529084  3.6676702  5.1879667 -6.0976466 -1.4168500
## 3 -0.3707611 11.1407256 -2.3020259  0.3690033  1.6802707 -0.3231750 -4.7904148
## 4 -3.3515223 -4.7427590  0.7148868  4.2153600  4.7990161 -4.8437490 -1.3765188
## 5 -1.9005105 -2.7737921  2.0107889  0.3778037  1.4563069  6.1176435  4.1513474
## 6 -3.0485228  0.9296108 -0.3530447  1.1343814  0.6607912 -0.6369411  0.7203393

str(Record)

## 'data.frame':    6 obs. of  14 variables:
##  $ X         : Factor w/ 6 levels "bili","doyu",..: 5 2 6 3 1 4
##  $ X2020.9.18: num  -3.07 -1.48 -1.57 -2.47 -3.13 ...
##  $ X2020.9.21: num  -0.424 -2.122 -1.26 -1.739 3.408 ...
##  $ X2020.9.22: num  1.087 -1.7173 4.0584 0.3065 -0.0692 ...
##  $ X2020.9.23: num  -6.39 -3.02 -6.56 -3.83 -2.72 ...
##  $ X2020.9.24: num  -2.645 0.295 3.06 1.538 1.11 ...
##  $ X2020.9.25: num  2.93 -4.9 1.66 -1.22 1.3 ...
##  $ X2020.9.28: num  -0.679 -3.658 -0.371 -3.352 -1.901 ...
##  $ X2020.9.29: num  5.91 -2.68 11.14 -4.74 -2.77 ...
##  $ X2020.9.30: num  -1.305 -2.653 -2.302 0.715 2.011 ...
##  $ X2020.10.1: num  -5.405 3.668 0.369 4.215 0.378 ...
##  $ X2020.10.2: num  4.12 5.19 1.68 4.8 1.46 ...
##  $ X2020.10.5: num  0.574 -6.098 -0.323 -4.844 6.118 ...
##  $ X2020.10.6: num  -2.09 -1.42 -4.79 -1.38 4.15 ...

#Save the label
label <- Record$X
#Remove the label
Record <- Record[,-c(1)]
head(Record)

##   X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25 X2020.9.28
## 1  -3.073415 -0.4242406  1.0869584  -6.388058 -2.6451603   2.931595 -0.6794355
## 2  -1.475106 -2.1221860 -1.7173066  -3.020133  0.2947769  -4.902622 -3.6576932
## 3  -1.572005 -1.2598414  4.0584417  -6.555673  3.0600502   1.664813 -0.3707611
## 4  -2.468234 -1.7391341  0.3065131  -3.825555  1.5378433  -1.223843 -3.3515223
## 5  -3.125701  3.4077323 -0.0692173  -2.719670  1.1103043   1.296210 -1.9005105
## 6   1.620678  3.4771168  2.1221298   0.254235  1.5791674  -1.313557 -3.0485228
##   X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1  5.9079650 -1.3053430 -5.4054109  4.1230774  0.5740550 -2.0880302
## 2 -2.6772785 -2.6529084  3.6676702  5.1879667 -6.0976466 -1.4168500
## 3 11.1407256 -2.3020259  0.3690033  1.6802707 -0.3231750 -4.7904148
## 4 -4.7427590  0.7148868  4.2153600  4.7990161 -4.8437490 -1.3765188
## 5 -2.7737921  2.0107889  0.3778037  1.4563069  6.1176435  4.1513474
## 6  0.9296108 -0.3530447  1.1343814  0.6607912 -0.6369411  0.7203393

##Check the distance
(Dist1<- dist(Record, method = "minkowski", p=1)) ##Manhattan

##          1        2        3        4        5
## 2 50.63452                                    
## 3 30.80305 53.01346                           
## 4 45.37053 17.09758 49.04916                  
## 5 47.55159 47.46583 51.87571 38.61895         
## 6 47.06358 41.85663 43.58235 35.89778 32.01258

(Dist2<- dist(Record, method = "minkowski", p=2)) #Euclidean

##           1         2         3         4         5
## 2 17.534875                                        
## 3 11.038747 19.161554                              
## 4 17.075426  6.219593 18.976512                    
## 5 15.631304 17.464686 19.898087 14.801747          
## 6 14.619378 12.684626 15.528929 12.082997 11.064903

(DistE<- dist(Record, method = "euclidean")) #same as p = 2

##           1         2         3         4         5
## 2 17.534875                                        
## 3 11.038747 19.161554                              
## 4 17.075426  6.219593 18.976512                    
## 5 15.631304 17.464686 19.898087 14.801747          
## 6 14.619378 12.684626 15.528929 12.082997 11.064903

df <- scale(Record)
#Silhouette method
fviz_nbclust(df, kmeans, method = "silhouette",k.max = 5)+
  labs(subtitle = "Silhouette method")

#Elbow method to determine the optimal number of cluster
fviz_nbclust(df, kmeans, method = "wss",k.max = 5) +
    geom_vline(xintercept = 3, linetype = 2)+
  labs(subtitle = "Elbow method")

#Gap method to determine the optimal number of cluster
set.seed(123)
fviz_nbclust(df, kmeans, nstart = 25,  method = "gap_stat", nboot = 50,k.max = 5)+
  labs(subtitle = "Gap statistic method")

#As a result, the optimal number of cluster is 3

(Record_Norm <- as.data.frame(apply(Record[,1:13],2,function(x)(x-min(x))/(max(x)-min(x)))))

##   X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24 X2020.9.25 X2020.9.28
## 1 0.01101603 0.30324229  0.4855241 0.02461336  0.0000000  1.0000000 0.90609043
## 2 0.34775878 0.00000000  0.0000000 0.51917585  0.5153074  0.0000000 0.00000000
## 3 0.32734336 0.15400928  1.0000000 0.00000000  1.0000000  0.8383014 1.00000000
## 4 0.13851978 0.06841063  0.3503996 0.40090379  0.7331901  0.4695784 0.09314793
## 5 0.00000000 0.98760836  0.2853465 0.56329733  0.6582517  0.7912510 0.53459659
## 6 1.00000000 1.00000000  0.6647513 1.00000000  0.7404333  0.4581269 0.18533099
##   X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1  0.6705534 0.28894786  0.0000000  0.7647784  0.5461763  0.3022206
## 2  0.1300395 0.00000000  0.9430721  1.0000000  0.0000000  0.3772819
## 3  1.0000000 0.07523698  0.6002029  0.2251911  0.4727249  0.0000000
## 4  0.0000000 0.72212987  1.0000000  0.9140854  0.1026498  0.3817923
## 5  0.1239632 1.00000000  0.6011176  0.1757201  1.0000000  1.0000000
## 6  0.3571238 0.49314172  0.6797576  0.0000000  0.4470385  0.6162940

(Dist_norm<- dist(Record_Norm, method = "minkowski", p=2))

##          1        2        3        4        5
## 2 2.096788                                    
## 3 1.524771 2.253341                           
## 4 1.890417 1.010665 1.997703                  
## 5 1.902406 2.301340 2.206530 1.784088         
## 6 2.259156 2.003615 2.044897 1.835237 1.541379

kmeans_Result <- kmeans(Record, 3, nstart=25)
print(kmeans_Result)

## K-means clustering with 3 clusters of sizes 2, 2, 2
## 
## Cluster means:
##   X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24   X2020.9.25
## 1 -2.3227102  -0.842041  2.5727000  -6.471865  0.2074450  2.298203685
## 2 -0.7525115   3.442425  1.0264562  -1.232717  1.3447359 -0.008673721
## 3 -1.9716700  -1.930660 -0.7053968  -3.422844  0.9163101 -3.063232782
##   X2020.9.28 X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 -0.5250983  8.5243453 -1.8036844 -2.5182038   2.901674   0.125440  -3.439222
## 2 -2.4745167 -0.9220906  0.8288721  0.7560926   1.058549   2.740351   2.435843
## 3 -3.5046077 -3.7100188 -0.9690108  3.9415151   4.993491  -5.470698  -1.396684
## 
## Clustering vector:
## [1] 1 3 1 3 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 60.92697 61.21604 19.34167
##  (between_SS / total_SS =  76.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

kmeans_Result$centers

##   X2020.9.18 X2020.9.21 X2020.9.22 X2020.9.23 X2020.9.24   X2020.9.25
## 1 -2.3227102  -0.842041  2.5727000  -6.471865  0.2074450  2.298203685
## 2 -0.7525115   3.442425  1.0264562  -1.232717  1.3447359 -0.008673721
## 3 -1.9716700  -1.930660 -0.7053968  -3.422844  0.9163101 -3.063232782
##   X2020.9.28 X2020.9.29 X2020.9.30 X2020.10.1 X2020.10.2 X2020.10.5 X2020.10.6
## 1 -0.5250983  8.5243453 -1.8036844 -2.5182038   2.901674   0.125440  -3.439222
## 2 -2.4745167 -0.9220906  0.8288721  0.7560926   1.058549   2.740351   2.435843
## 3 -3.5046077 -3.7100188 -0.9690108  3.9415151   4.993491  -5.470698  -1.396684

#Show the result
kmeans_Result$cluster

## [1] 1 3 1 3 2 2

#Visualize

fviz_cluster(kmeans_Result, Record, main="Euclidean")

#Hierarchical Clustering
(HClust_Ward_Euc_DF <- hclust(Dist2, method = "average" ))

## 
## Call:
## hclust(d = Dist2, method = "average")
## 
## Cluster method   : average 
## Distance         : minkowski 
## Number of objects: 6

plot(HClust_Ward_Euc_DF, cex=0.9, hang=-1, main = "Minkowski p=2 (Euclidean)")
rect.hclust(HClust_Ward_Euc_DF, k=3)

#heatmap
fviz_dist(Dist2, gradient = list(low = "#00AFBB", 
                            mid = "white", high = "#FC4E07"))+
                            ggtitle("Euclidean Heatmap")

## Text Data

corpus <- Corpus(DirSource(directory = "C:/Users/Haoming Feng/Desktop/501/project/corpus"))
getTransformations()

## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

ndocs <- length(corpus)

corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
## Remove all Stop Words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus_dtm <- DocumentTermMatrix(corpus,control = list(stopwords = TRUE,
                                                       wordLengths = c(3,10),
                                                       removePunctuation = TRUE,
                                                       removeNumbers= TRUE,
                                                       tolower = TRUE))
inspect(corpus_dtm)

## <<DocumentTermMatrix (documents: 9, terms: 55)>>
## Non-/sparse entries: 71/424
## Sparsity           : 86%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##                Terms
## Docs            aims battery build extesla recycler stock straubel tesla top
##   newseight.txt    0       0     0       0        0     0        0     0   0
##   newsfive.txt     1       1     1       1        1     0        1     0   1
##   newsfour.txt     1       1     1       1        1     0        1     0   1
##   newsnine.txt     0       0     0       0        0     2        0     1   0
##   newsone.txt      0       0     0       0        0     0        0     1   0
##   newsseven.txt    0       0     0       0        0     1        0     1   0
##   newssix.txt      0       0     0       0        0     0        0     1   0
##   newsthree.txt    0       0     0       0        0     1        0     1   0
##   newstwo.txt      0       0     0       0        0     0        0     0   0
##                Terms
## Docs            worlds
##   newseight.txt      0
##   newsfive.txt       1
##   newsfour.txt       1
##   newsnine.txt       0
##   newsone.txt        0
##   newsseven.txt      0
##   newssix.txt        0
##   newsthree.txt      0
##   newstwo.txt        0

corpus_mx <- (as.matrix(corpus_dtm))
word.freq <- sort(colSums(corpus_mx),decreasing = T)
#Wordcloud here
wordcloud(words = names(word.freq),freq = word.freq*2,min.freq = 2)

module 3

Haoming Feng

2020/10/7