library(viridis)
## Loading required package: viridisLite
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(TSP)
library(data.table)
#library(ggplot2)
#library(Matrix)
library(tcltk)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(devtools)
## Loading required package: usethis
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(arulesViz)
## Loading required package: grid
library(networkD3)
Data Preparation
# Read stocks.csv as basket-format transactions (comma-separated items, one
# transaction per line) with duplicate-item removal switched off, then list
# every transaction.
# NOTE(review): the listing printed below shows spelling/case variants
# (Jd/JD, Pdd/PDD, Tesla/Tsla, Nvidia/Nvdia, Amd/AMD) that are treated as
# distinct items -- confirm whether the CSV should be normalised first.
stocks <- read.transactions(
  file = "stocks.csv",
  format = "basket",
  sep = ",",
  cols = NULL,
  rm.duplicates = FALSE
)
inspect(stocks)
## items
## [1] {Apple,NIO,Tesla}
## [2] {Doyu,Huya,NIO,Tesla}
## [3] {Apple,Doyu,Huya,Nvidia}
## [4] {Doyu,Huya,Iqiyi}
## [5] {Huya,Iqiyi}
## [6] {Alibaba,Tesla,Xpev}
## [7] {Bilibili,Huya,Iqiyi}
## [8] {Bilibili,Huya,Iqiyi}
## [9] {Apple,Jd,Pdd,Tesla}
## [10] {Apple,NIO,Tesla,Xpev}
## [11] {JD,PDD}
## [12] {Alibaba,NIO,Xpev}
## [13] {NIO,Tsla,Xpev}
## [14] {Amd,Apple,Nvdia}
## [15] {Amd,Nvdia,Xpev}
## [16] {Jd,NIO,Xpev}
## [17] {Apple,Bilibili,Jd}
## [18] {Bilibili,Doyu,Huya}
## [19] {AMD,Apple,Tesla}
## [20] {Nvidia,Tesla,Xpev}
Association Rule Mining
# Mine association rules with Apriori: minimum support 0.1, minimum
# confidence 0.5, and at least two items per rule.
stock_rule <- arules::apriori(
  stocks,
  parameter = list(support = 0.1, confidence = 0.5, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[18 item(s), 20 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [22 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Print every mined rule together with its quality measures.
inspect(stock_rule)
## lhs rhs support confidence coverage lift count
## [1] {Nvdia} => {Amd} 0.10 1.0000000 0.10 10.000000 2
## [2] {Amd} => {Nvdia} 0.10 1.0000000 0.10 10.000000 2
## [3] {Alibaba} => {Xpev} 0.10 1.0000000 0.10 2.857143 2
## [4] {Jd} => {Apple} 0.10 0.6666667 0.15 1.904762 2
## [5] {Iqiyi} => {Bilibili} 0.10 0.5000000 0.20 2.500000 2
## [6] {Bilibili} => {Iqiyi} 0.10 0.5000000 0.20 2.500000 2
## [7] {Iqiyi} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [8] {Huya} => {Iqiyi} 0.20 0.5714286 0.35 2.857143 4
## [9] {Bilibili} => {Huya} 0.15 0.7500000 0.20 2.142857 3
## [10] {Doyu} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [11] {Huya} => {Doyu} 0.20 0.5714286 0.35 2.857143 4
## [12] {NIO} => {Xpev} 0.20 0.6666667 0.30 1.904762 4
## [13] {Xpev} => {NIO} 0.20 0.5714286 0.35 1.904762 4
## [14] {NIO} => {Tesla} 0.15 0.5000000 0.30 1.428571 3
## [15] {Apple} => {Tesla} 0.20 0.5714286 0.35 1.632653 4
## [16] {Tesla} => {Apple} 0.20 0.5714286 0.35 1.632653 4
## [17] {Bilibili,Iqiyi} => {Huya} 0.10 1.0000000 0.10 2.857143 2
## [18] {Huya,Iqiyi} => {Bilibili} 0.10 0.5000000 0.20 2.500000 2
## [19] {Bilibili,Huya} => {Iqiyi} 0.10 0.6666667 0.15 3.333333 2
## [20] {Apple,NIO} => {Tesla} 0.10 1.0000000 0.10 2.857143 2
## [21] {NIO,Tesla} => {Apple} 0.10 0.6666667 0.15 1.904762 2
## [22] {Apple,Tesla} => {NIO} 0.10 0.5000000 0.20 1.666667 2
# Rank the rules from highest to lowest confidence.
sortedrulesk <- sort(stock_rule, by = "confidence", decreasing = TRUE)
# head() returns at most 15 rules, so this also works when fewer than 15
# rules were mined (indexing with [1:15] is not safe in that case).
inspect(head(sortedrulesk, n = 15))
## lhs rhs support confidence coverage lift count
## [1] {Nvdia} => {Amd} 0.10 1.0000000 0.10 10.000000 2
## [2] {Amd} => {Nvdia} 0.10 1.0000000 0.10 10.000000 2
## [3] {Alibaba} => {Xpev} 0.10 1.0000000 0.10 2.857143 2
## [4] {Iqiyi} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [5] {Doyu} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [6] {Bilibili,Iqiyi} => {Huya} 0.10 1.0000000 0.10 2.857143 2
## [7] {Apple,NIO} => {Tesla} 0.10 1.0000000 0.10 2.857143 2
## [8] {Bilibili} => {Huya} 0.15 0.7500000 0.20 2.142857 3
## [9] {Jd} => {Apple} 0.10 0.6666667 0.15 1.904762 2
## [10] {NIO} => {Xpev} 0.20 0.6666667 0.30 1.904762 4
## [11] {Bilibili,Huya} => {Iqiyi} 0.10 0.6666667 0.15 3.333333 2
## [12] {NIO,Tesla} => {Apple} 0.10 0.6666667 0.15 1.904762 2
## [13] {Huya} => {Iqiyi} 0.20 0.5714286 0.35 2.857143 4
## [14] {Huya} => {Doyu} 0.20 0.5714286 0.35 2.857143 4
## [15] {Xpev} => {NIO} 0.20 0.5714286 0.35 1.904762 4
# The top 15 rules by confidence are fairly intuitive. A confidence of 1 means that, in this sample, whenever the left-hand side (A) appears, the right-hand side (B) appears as well, i.e. there is a strong association between A and B. For example, Nvidia and AMD are both CPU/GPU manufacturers, so people who buy Nvdia often buy Amd as well. Doyu and Huya are another example: both are in the streaming industry, so it is plausible that this rule has a confidence of 1.
# Rank the rules from highest to lowest lift.
sortedrulesl <- sort(stock_rule, by = "lift", decreasing = TRUE)
# head() returns at most 15 rules, so this also works when fewer than 15
# rules were mined (indexing with [1:15] is not safe in that case).
inspect(head(sortedrulesl, n = 15))
## lhs rhs support confidence coverage lift count
## [1] {Nvdia} => {Amd} 0.10 1.0000000 0.10 10.000000 2
## [2] {Amd} => {Nvdia} 0.10 1.0000000 0.10 10.000000 2
## [3] {Bilibili,Huya} => {Iqiyi} 0.10 0.6666667 0.15 3.333333 2
## [4] {Alibaba} => {Xpev} 0.10 1.0000000 0.10 2.857143 2
## [5] {Iqiyi} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [6] {Doyu} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [7] {Bilibili,Iqiyi} => {Huya} 0.10 1.0000000 0.10 2.857143 2
## [8] {Apple,NIO} => {Tesla} 0.10 1.0000000 0.10 2.857143 2
## [9] {Huya} => {Iqiyi} 0.20 0.5714286 0.35 2.857143 4
## [10] {Huya} => {Doyu} 0.20 0.5714286 0.35 2.857143 4
## [11] {Iqiyi} => {Bilibili} 0.10 0.5000000 0.20 2.500000 2
## [12] {Bilibili} => {Iqiyi} 0.10 0.5000000 0.20 2.500000 2
## [13] {Huya,Iqiyi} => {Bilibili} 0.10 0.5000000 0.20 2.500000 2
## [14] {Bilibili} => {Huya} 0.15 0.7500000 0.20 2.142857 3
## [15] {Jd} => {Apple} 0.10 0.6666667 0.15 1.904762 2
# A lift > 1 means that A and B occur together more often than expected if they were independent (positive association); a lift = 1 means they are independent; a lift < 1 means they occur together less often than expected (negative association). In the top 15 we can see that Nvdia and Amd, and Bilibili, Huya and Iqiyi, have very high lift, which means they are strongly associated.
# Rank the rules from highest to lowest support.
sortedruless <- sort(stock_rule, by = "support", decreasing = TRUE)
# head() returns at most 15 rules, so this also works when fewer than 15
# rules were mined (indexing with [1:15] is not safe in that case).
inspect(head(sortedruless, n = 15))
## lhs rhs support confidence coverage lift count
## [1] {Iqiyi} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [2] {Huya} => {Iqiyi} 0.20 0.5714286 0.35 2.857143 4
## [3] {Doyu} => {Huya} 0.20 1.0000000 0.20 2.857143 4
## [4] {Huya} => {Doyu} 0.20 0.5714286 0.35 2.857143 4
## [5] {NIO} => {Xpev} 0.20 0.6666667 0.30 1.904762 4
## [6] {Xpev} => {NIO} 0.20 0.5714286 0.35 1.904762 4
## [7] {Apple} => {Tesla} 0.20 0.5714286 0.35 1.632653 4
## [8] {Tesla} => {Apple} 0.20 0.5714286 0.35 1.632653 4
## [9] {Bilibili} => {Huya} 0.15 0.7500000 0.20 2.142857 3
## [10] {NIO} => {Tesla} 0.15 0.5000000 0.30 1.428571 3
## [11] {Nvdia} => {Amd} 0.10 1.0000000 0.10 10.000000 2
## [12] {Amd} => {Nvdia} 0.10 1.0000000 0.10 10.000000 2
## [13] {Alibaba} => {Xpev} 0.10 1.0000000 0.10 2.857143 2
## [14] {Jd} => {Apple} 0.10 0.6666667 0.15 1.904762 2
## [15] {Iqiyi} => {Bilibili} 0.10 0.5000000 0.20 2.500000 2
# Support measures how often an itemset occurs. Here, the higher the support, the more popular the stocks are in the sample group: Iqiyi, Huya, Doyu, NIO, etc. are very popular stocks among these 20 people.
Visualization and Network3D
# Bar chart of absolute item counts for (up to) the 20 most frequent items.
itemFrequencyPlot(stocks, type = "absolute", topN = 20)
# Re-sort the confidence-ranked rules by lift, keep the first ten, and draw
# the default arulesViz plot for them.
subrulesK <- head(sort(sortedrulesk, by = "lift"), n = 10)
plot(subrulesK)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Interactive graph view of the same ten rules.
plot(subrulesK, method = "graph", engine = "interactive")
# networkD3 preparation: flatten all mined rules into a data frame with
# separate LHS and RHS columns, then preview the first rows.
Rules_DF2 <- DATAFRAME(stock_rule, separate = TRUE)
print(head(Rules_DF2))
## LHS RHS support confidence coverage lift count
## 1 {Nvdia} {Amd} 0.1 1.0000000 0.10 10.000000 2
## 2 {Amd} {Nvdia} 0.1 1.0000000 0.10 10.000000 2
## 3 {Alibaba} {Xpev} 0.1 1.0000000 0.10 2.857143 2
## 4 {Jd} {Apple} 0.1 0.6666667 0.15 1.904762 2
## 5 {Iqiyi} {Bilibili} 0.1 0.5000000 0.20 2.500000 2
## 6 {Bilibili} {Iqiyi} 0.1 0.5000000 0.20 2.500000 2
# Show the column types of the rules data frame.
str(Rules_DF2)
## 'data.frame': 22 obs. of 7 variables:
## $ LHS : Factor w/ 18 levels "{Nvdia}","{Amd}",..: 1 2 3 4 5 6 5 7 6 8 ...
## $ RHS : Factor w/ 10 levels "{Amd}","{Nvdia}",..: 1 2 3 4 5 6 7 6 7 7 ...
## $ support : num 0.1 0.1 0.1 0.1 0.1 0.1 0.2 0.2 0.15 0.2 ...
## $ confidence: num 1 1 1 0.667 0.5 ...
## $ coverage : num 0.1 0.1 0.1 0.15 0.2 0.2 0.2 0.35 0.2 0.2 ...
## $ lift : num 10 10 2.86 1.9 2.5 ...
## $ count : int 2 2 2 2 2 2 4 4 3 4 ...
# Strip the surrounding braces from the itemset labels. Only LHS and RHS are
# rewritten: running gsub() over every column would turn the numeric quality
# measures (support, confidence, ...) into character strings.
Rules_DF2[c("LHS", "RHS")] <- lapply(
  Rules_DF2[c("LHS", "RHS")],
  function(labels) gsub("[{}]", "", as.character(labels))
)
# Edge table for the network: antecedent -> consequent, weighted by the
# rule's confidence (selected by name rather than by column position).
Rules_C <- Rules_DF2[c("LHS", "RHS", "confidence")]
names(Rules_C) <- c("SourceName", "TargetName", "Weight")
head(Rules_C, 30)
## SourceName TargetName Weight
## 1 Nvdia Amd 1
## 2 Amd Nvdia 1
## 3 Alibaba Xpev 1
## 4 Jd Apple 0.666666666666667
## 5 Iqiyi Bilibili 0.5
## 6 Bilibili Iqiyi 0.5
## 7 Iqiyi Huya 1
## 8 Huya Iqiyi 0.571428571428571
## 9 Bilibili Huya 0.75
## 10 Doyu Huya 1
## 11 Huya Doyu 0.571428571428571
## 12 NIO Xpev 0.666666666666667
## 13 Xpev NIO 0.571428571428571
## 14 NIO Tesla 0.5
## 15 Apple Tesla 0.571428571428571
## 16 Tesla Apple 0.571428571428571
## 17 Bilibili,Iqiyi Huya 1
## 18 Huya,Iqiyi Bilibili 0.5
## 19 Bilibili,Huya Iqiyi 0.666666666666667
## 20 Apple,NIO Tesla 1
## 21 NIO,Tesla Apple 0.666666666666667
## 22 Apple,Tesla NIO 0.5
# Carry the source/target/weight table forward under the names used by the
# graph-building code below, and print it.
Rules_Sup <- Rules_C
edgeList <- Rules_Sup
print(edgeList)
## SourceName TargetName Weight
## 1 Nvdia Amd 1
## 2 Amd Nvdia 1
## 3 Alibaba Xpev 1
## 4 Jd Apple 0.666666666666667
## 5 Iqiyi Bilibili 0.5
## 6 Bilibili Iqiyi 0.5
## 7 Iqiyi Huya 1
## 8 Huya Iqiyi 0.571428571428571
## 9 Bilibili Huya 0.75
## 10 Doyu Huya 1
## 11 Huya Doyu 0.571428571428571
## 12 NIO Xpev 0.666666666666667
## 13 Xpev NIO 0.571428571428571
## 14 NIO Tesla 0.5
## 15 Apple Tesla 0.571428571428571
## 16 Tesla Apple 0.571428571428571
## 17 Bilibili,Iqiyi Huya 1
## 18 Huya,Iqiyi Bilibili 0.5
## 19 Bilibili,Huya Iqiyi 0.666666666666667
## 20 Apple,NIO Tesla 1
## 21 NIO,Tesla Apple 0.666666666666667
## 22 Apple,Tesla NIO 0.5
# Build a directed graph from the edge table (the first two columns are the
# edge endpoints) and let simplify() drop duplicate edges and self-loops.
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame().
MyGraph <- igraph::simplify(
  igraph::graph_from_data_frame(edgeList, directed = TRUE)
)
# Node table for networkD3, which requires node IDs to start at 0.
# seq_len() is used because 0:(n - 1) would yield c(0, -1) when n is 0.
nodeList <- data.frame(
  ID = seq_len(igraph::vcount(MyGraph)) - 1L,
  nName = igraph::V(MyGraph)$name
)
## Node Degree
# Degree counts both incoming and outgoing edges (mode = "all").
(nodeList <- cbind(
  nodeList,
  nodeDegree = igraph::degree(MyGraph, v = igraph::V(MyGraph), mode = "all")
))
## ID nName nodeDegree
## Nvdia 0 Nvdia 2
## Amd 1 Amd 2
## Alibaba 2 Alibaba 1
## Jd 3 Jd 1
## Iqiyi 4 Iqiyi 5
## Bilibili 5 Bilibili 4
## Huya 6 Huya 6
## Doyu 7 Doyu 2
## NIO 8 NIO 4
## Xpev 9 Xpev 3
## Apple 10 Apple 4
## Tesla 11 Tesla 4
## Bilibili,Iqiyi 12 Bilibili,Iqiyi 1
## Huya,Iqiyi 13 Huya,Iqiyi 1
## Bilibili,Huya 14 Bilibili,Huya 1
## Apple,NIO 15 Apple,NIO 1
## NIO,Tesla 16 NIO,Tesla 1
## Apple,Tesla 17 Apple,Tesla 1
## Betweenness
# Directed betweenness for every vertex, appended to the node table as
# nodeBetweenness; the extended table is then printed.
BetweenNess <- igraph::betweenness(
  graph = MyGraph,
  v = igraph::V(MyGraph),
  directed = TRUE
)
nodeList <- cbind(nodeList, nodeBetweenness = BetweenNess)
print(nodeList)
## ID nName nodeDegree nodeBetweenness
## Nvdia 0 Nvdia 2 0
## Amd 1 Amd 2 0
## Alibaba 2 Alibaba 1 0
## Jd 3 Jd 1 0
## Iqiyi 4 Iqiyi 5 6
## Bilibili 5 Bilibili 4 3
## Huya 6 Huya 6 9
## Doyu 7 Doyu 2 0
## NIO 8 NIO 4 7
## Xpev 9 Xpev 3 3
## Apple 10 Apple 4 2
## Tesla 11 Tesla 4 5
## Bilibili,Iqiyi 12 Bilibili,Iqiyi 1 0
## Huya,Iqiyi 13 Huya,Iqiyi 1 0
## Bilibili,Huya 14 Bilibili,Huya 1 0
## Apple,NIO 15 Apple,NIO 1 0
## NIO,Tesla 16 NIO,Tesla 1 0
## Apple,Tesla 17 Apple,Tesla 1 0
## This can change the BetweenNess value if needed
# NOTE(review): nodeList$nodeBetweenness was bound before this line, so the
# rescaling below only changes the standalone BetweenNess vector, not the
# node table.
BetweenNess<-BetweenNess/100
# Zero-based position of vertex name `x` among the vertices of MyGraph
# (IDs start at 0). Yields numeric(0) when no vertex has that name.
getNodeID <- function(x) {
  vertex_names <- igraph::V(MyGraph)$name
  which(x == vertex_names) - 1
}
# Sanity check with a name that is not a vertex of the graph.
print(getNodeID("elephants"))
## numeric(0)
# Attach the zero-based source and target node IDs to every rule edge,
# processing the table one (SourceName, TargetName, Weight) group at a time.
edgeList <- plyr::ddply(
  .data = Rules_Sup,
  .variables = c("SourceName", "TargetName" , "Weight"),
  .fun = function(edge) {
    data.frame(
      SourceID = getNodeID(edge$SourceName),
      TargetID = getNodeID(edge$TargetName)
    )
  }
)
head(edgeList)
## SourceName TargetName Weight SourceID TargetID
## 1 Alibaba Xpev 1 2 9
## 2 Amd Nvdia 1 1 0
## 3 Apple Tesla 0.571428571428571 10 11
## 4 Apple,NIO Tesla 1 15 11
## 5 Apple,Tesla NIO 0.5 17 8
## 6 Bilibili Huya 0.75 5 6
# Number of edges after the node IDs were attached.
nrow(edgeList)
## [1] 22
# Pairwise Dice similarity between all vertices, counting neighbours in both
# directions (mode = "all"). igraph::similarity(method = "dice") replaces the
# deprecated igraph::similarity.dice() wrapper.
DiceSim <- igraph::similarity(
  MyGraph,
  vids = igraph::V(MyGraph),
  mode = "all",
  method = "dice"
)
head(DiceSim)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] 1 0 0 0 0.0000000 0.0000000 0.0000000 0.0 0.0 0 0
## [2,] 0 1 0 0 0.0000000 0.0000000 0.0000000 0.0 0.0 0 0
## [3,] 0 0 1 0 0.0000000 0.0000000 0.0000000 0.0 0.5 0 0
## [4,] 0 0 0 1 0.0000000 0.0000000 0.0000000 0.0 0.0 0 0
## [5,] 0 0 0 0 1.0000000 0.3333333 0.2857143 0.5 0.0 0 0
## [6,] 0 0 0 0 0.3333333 1.0000000 0.2857143 0.5 0.0 0 0
## [,12] [,13] [,14] [,15] [,16] [,17] [,18]
## [1,] 0.0 0.0 0.0 0.0 0 0 0
## [2,] 0.0 0.0 0.0 0.0 0 0 0
## [3,] 0.0 0.0 0.0 0.0 0 0 0
## [4,] 0.5 0.0 0.0 0.0 0 1 0
## [5,] 0.0 0.5 0.5 0.0 0 0 0
## [6,] 0.0 0.5 0.0 0.5 0 0 0
# Look up the Dice similarity for the edge(s) in `x`. SourceID and TargetID
# are zero-based, so both are shifted by one to index the DiceSim matrix.
# Returns a data frame with a single column, diceSim.
F1 <- function(x) {
  src_row <- x$SourceID + 1
  tgt_col <- x$TargetID + 1
  data.frame(diceSim = DiceSim[src_row, tgt_col])
}
# Preview edgeList; the Dice similarity column is added to it next.
head(edgeList)
## SourceName TargetName Weight SourceID TargetID
## 1 Alibaba Xpev 1 2 9
## 2 Amd Nvdia 1 1 0
## 3 Apple Tesla 0.571428571428571 10 11
## 4 Apple,NIO Tesla 1 15 11
## 5 Apple,Tesla NIO 0.5 17 8
## 6 Bilibili Huya 0.75 5 6
# Add a diceSim column: each edge row receives the Dice similarity of its
# two endpoints, looked up through F1().
edgeList <- plyr::ddply(
  .data = edgeList,
  .variables = c("SourceName", "TargetName", "Weight",
                 "SourceID", "TargetID"),
  .fun = function(edge) data.frame(F1(edge))
)
head(edgeList)
## SourceName TargetName Weight SourceID TargetID diceSim
## 1 Alibaba Xpev 1 2 9 0.0000000
## 2 Amd Nvdia 1 1 0 0.0000000
## 3 Apple Tesla 0.571428571428571 10 11 0.0000000
## 4 Apple,NIO Tesla 1 15 11 0.0000000
## 5 Apple,Tesla NIO 0.5 17 8 0.0000000
## 6 Bilibili Huya 0.75 5 6 0.2857143
##################################################################################
################## color #################################################
######################################################
# Palette generator that interpolates linearly in RGB from green to red;
# `bias` is set to the number of edges. Printing it shows the generated
# function.
COLOR_P <- colorRampPalette(
  colors = c("#00FF00", "#FF0000"),
  space = "rgb",
  interpolate = "linear",
  bias = nrow(edgeList)
)
print(COLOR_P)
## function (n)
## {
## x <- ramp(seq.int(0, 1, length.out = n))
## if (ncol(x) == 4L)
## rgb(x[, 1L], x[, 2L], x[, 3L], x[, 4L], maxColorValue = 255)
## else rgb(x[, 1L], x[, 2L], x[, 3L], maxColorValue = 255)
## }
## <bytecode: 0x000000002082b088>
## <environment: 0x0000000020a1ffd8>
# One palette colour per distinct Dice similarity value.
(colCodes <- COLOR_P(length(unique(edgeList$diceSim))))
# Colour for each edge: the position of its similarity among the sorted
# distinct values selects the palette entry. match() always returns a plain
# vector with one colour per edge, whereas sapply() + which() could return a
# list if a value failed to match.
edges_col <- colCodes[match(edgeList$diceSim, sort(unique(edgeList$diceSim)))]
# edges_col is a vector, so report its length (nrow() returns NULL for
# vectors).
length(edges_col)
## NULL
## NetworkD3 Object
# https://www.rdocumentation.org/packages/networkD3/versions/0.4/topics/forceNetwork
# Force-directed network widget built from the two tables prepared above:
#   - Links: edgeList, connecting SourceID -> TargetID and valued by Weight;
#   - Nodes: nodeList, labelled by nName, sized by nodeBetweenness and
#     coloured (grouped) by nodeDegree.
# linkDistance and linkWidth are JavaScript callbacks that derive the link
# length and thickness from the link value.
D3_network_Tweets <- networkD3::forceNetwork(
  # Edge table and the columns that describe each edge.
  Links = edgeList,
  Source = "SourceID",
  Target = "TargetID",
  Value = "Weight",
  # Node table and the columns that describe each node.
  Nodes = nodeList,
  NodeID = "nName",
  Nodesize = "nodeBetweenness",
  Group = "nodeDegree",
  # Widget size in pixels and label font size.
  height = 700,
  width = 900,
  fontSize = 20,
  # Link geometry computed from the link value.
  linkDistance = networkD3::JS("function(d) { return d.value*10; }"),
  linkWidth = networkD3::JS("function(d) { return d.value/10; }"),
  # Appearance and interaction.
  opacity = 0.9,
  opacityNoHover = 0.9, # label opacity when not hovered
  zoom = TRUE,
  linkColour = "red" # single fixed edge colour; edges_col is not used here
)
# To display the widget interactively, uncomment the next line.
#D3_network_Tweets
# Write the widget to a self-contained HTML file.
networkD3::saveNetwork(
  network = D3_network_Tweets,
  file = "NetD3_DCR2019_worldNewsL.html",
  selfcontained = TRUE
)