# Read the retail workbook, hand it to mbar_prep_data() together with the
# InvoiceNo and Description columns, and print the first rows of the result.
mba_data <- read_excel(path = "online-retail.xlsx")
transactions <- mbar_prep_data(
  mba_data,
  InvoiceNo,
  Description
)
head(transactions, n = 6L)
## # A tibble: 6 x 1,114
## item_1 item_2 item_3 item_4 item_5 item_6 item_7 item_8 item_9 item_10
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 BATH ~ "" "" "" "" "" "" "" "" ""
## 2 PAPER~ "" "" "" "" "" "" "" "" ""
## 3 VICTO~ "" "" "" "" "" "" "" "" ""
## 4 JAM M~ "" "" "" "" "" "" "" "" ""
## 5 Disco~ "" "" "" "" "" "" "" "" ""
## 6 SET O~ "" "" "" "" "" "" "" "" ""
## # ... with 1,104 more variables: item_11 <chr>, item_12 <chr>,
## # item_13 <chr>, item_14 <chr>, item_15 <chr>, item_16 <chr>,
## # item_17 <chr>, item_18 <chr>, item_19 <chr>, item_20 <chr>,
## # item_21 <chr>, item_22 <chr>, item_23 <chr>, item_24 <chr>,
## # item_25 <chr>, item_26 <chr>, item_27 <chr>, item_28 <chr>,
## # item_29 <chr>, item_30 <chr>, item_31 <chr>, item_32 <chr>,
## # item_33 <chr>, item_34 <chr>, item_35 <chr>, item_36 <chr>,
## # item_37 <chr>, item_38 <chr>, item_39 <chr>, item_40 <chr>,
## # item_41 <chr>, item_42 <chr>, item_43 <chr>, item_44 <chr>,
## # item_45 <chr>, item_46 <chr>, item_47 <chr>, item_48 <chr>,
## # item_49 <chr>, item_50 <chr>, item_51 <chr>, item_52 <chr>,
## # item_53 <chr>, item_54 <chr>, item_55 <chr>, item_56 <chr>,
## # item_57 <chr>, item_58 <chr>, item_59 <chr>, item_60 <chr>,
## # item_61 <chr>, item_62 <chr>, item_63 <chr>, item_64 <chr>,
## # item_65 <chr>, item_66 <chr>, item_67 <chr>, item_68 <chr>,
## # item_69 <chr>, item_70 <chr>, item_71 <chr>, item_72 <chr>,
## # item_73 <chr>, item_74 <chr>, item_75 <chr>, item_76 <chr>,
## # item_77 <chr>, item_78 <chr>, item_79 <chr>, item_80 <chr>,
## # item_81 <chr>, item_82 <chr>, item_83 <chr>, item_84 <chr>,
## # item_85 <chr>, item_86 <chr>, item_87 <chr>, item_88 <chr>,
## # item_89 <chr>, item_90 <chr>, item_91 <chr>, item_92 <chr>,
## # item_93 <chr>, item_94 <chr>, item_95 <chr>, item_96 <chr>,
## # item_97 <chr>, item_98 <chr>, item_99 <chr>, item_100 <chr>,
## # item_101 <chr>, item_102 <chr>, item_103 <chr>, item_104 <chr>,
## # item_105 <chr>, item_106 <chr>, item_107 <chr>, item_108 <chr>,
## # item_109 <chr>, item_110 <chr>, ...
## [1] 20.92313
## [1] 10
## # A tibble: 4,212 x 2
## Description count
## <chr> <int>
## 1 WHITE HANGING HEART T-LIGHT HOLDER 2369
## 2 REGENCY CAKESTAND 3 TIER 2200
## 3 JUMBO BAG RED RETROSPOT 2159
## 4 PARTY BUNTING 1727
## 5 LUNCH BAG RED RETROSPOT 1638
## 6 ASSORTED COLOUR BIRD ORNAMENT 1501
## 7 SET OF 3 CAKE TINS PANTRY DESIGN 1473
## 8 <NA> 1454
## 9 PACK OF 72 RETROSPOT CAKE CASES 1385
## 10 LUNCH BAG BLACK SKULL. 1350
## # ... with 4,202 more rows
# Average value per invoice: the per-invoice UnitPrice sums added together,
# divided by the number of distinct InvoiceNo values.
# NOTE(review): only UnitPrice is summed; if the data also carries a quantity
# column, "revenue" presumably needs price * quantity -- confirm.
total_revenue <- mba_data %>%
  group_by(InvoiceNo) %>%
  summarize(order_sum = sum(UnitPrice)) %>%
  with(sum(order_sum))

# One row per distinct invoice number, so the row count is the invoice count.
total_transactions <- mba_data %>%
  distinct(InvoiceNo) %>%
  nrow()

total_revenue / total_transactions
## [1] 96.47892
## transactions in sparse format with
## 25901 transactions (rows) and
## 10085 items (columns)
## transactions as itemMatrix in sparse format with
## 25901 rows (elements/itemsets/transactions) and
## 10085 columns (items) and a density of 0.001660018
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 1999 1914
## JUMBO BAG RED RETROSPOT PARTY BUNTING
## 1806 1488
## LUNCH BAG RED RETROSPOT (Other)
## 1404 425005
##
## element (itemset/transaction) length distribution:
## sizes
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1454 4578 1727 1208 942 891 781 715 696 683 612 642 547 530 543
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 555 537 479 459 491 428 405 328 311 280 248 261 235 221 233
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 224 175 174 145 149 139 122 119 100 117 98 94 102 93 72
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 73 74 71 69 68 59 70 49 49 54 57 42 32 42 39
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 34 40 22 27 30 24 34 28 25 21 23 26 14 17 24
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## 11 18 14 13 10 16 18 15 10 9 16 13 16 13 7
## 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
## 8 12 12 8 7 7 4 7 9 5 8 8 4 5 7
## 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## 2 3 7 9 4 7 4 2 7 1 1 4 7 6 2
## 120 121 122 123 124 125 126 127 129 130 131 132 133 134 135
## 3 5 4 4 2 5 6 2 1 4 3 6 6 3 4
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
## 3 2 1 1 3 8 5 3 4 4 6 2 3 1 4
## 151 152 153 154 155 156 157 158 159 160 162 163 164 167 168
## 3 2 4 7 3 3 5 2 4 5 1 2 1 3 5
## 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
## 2 2 4 3 1 3 5 1 2 2 2 2 1 2 1
## 184 185 186 187 189 190 192 193 194 196 197 198 201 202 204
## 2 1 1 2 2 1 1 5 1 2 3 2 1 1 2
## 205 206 207 208 209 212 213 215 219 220 224 226 227 228 230
## 2 1 3 3 2 1 2 2 7 1 3 3 1 1 2
## 232 234 236 238 240 241 244 248 249 250 252 256 257 258 260
## 1 2 1 2 2 2 1 1 2 2 1 1 1 1 2
## 261 263 265 266 270 272 281 284 285 298 299 301 303 304 305
## 1 2 1 1 1 1 1 1 2 1 2 1 1 1 3
## 312 314 316 320 321 326 327 329 332 333 338 339 341 344 348
## 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1
## 350 360 365 367 375 391 394 398 400 402 405 411 419 422 429
## 1 2 1 1 3 1 1 1 1 1 1 1 2 1 1
## 431 442 447 460 468 471 477 509 514 530 587 627 1114
## 2 1 1 1 1 1 1 1 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 8.00 16.74 20.00 1114.00
##
## includes extended item information - examples:
## labels
## 1 *Boombox Ipod Classic
## 2 *USB Office Mirror Ball
## 3 ?
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.009 1
## maxlen target ext
## 4 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 233
##
## set item appearances ...[0 item(s)] done [0.02s].
## set transactions ...[10085 item(s), 25901 transaction(s)] done [1.30s].
## sorting and recoding items ... [508 item(s)] done [0.02s].
## creating transaction tree ... done [0.05s].
## checking subsets of size 1 2 3 4
## Warning in apriori(basket_data, parameter = list(supp = 0.009, conf =
## 0.8, : Mining stopped (maxlen reached). Only patterns up to a length of 4
## returned!
## done [0.08s].
## writing ... [22 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
## set of 22 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 11 9 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.500 2.591 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.009034 Min. :0.8035 Min. :22.59 Min. :234.0
## 1st Qu.:0.010453 1st Qu.:0.8530 1st Qu.:25.02 1st Qu.:270.8
## Median :0.013223 Median :0.8868 Median :55.94 Median :342.5
## Mean :0.012760 Mean :0.9120 Mean :48.55 Mean :330.5
## 3rd Qu.:0.014362 3rd Qu.:1.0000 3rd Qu.:61.23 3rd Qu.:372.0
## Max. :0.018339 Max. :1.0000 Max. :71.30 Max. :475.0
##
## mining info:
## data ntransactions support confidence
## basket_data 25901 0.009 0.8
## lhs rhs support confidence lift count
## [1] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [2] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [3] {SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [4] {SET 3 RETROSPOT TEA} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [5] {SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [6] {SHED} => {KEY FOB} 0.011273696 1.0000000 61.23168 292
## [7] {SET 3 RETROSPOT TEA,
## SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [8] {COFFEE,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [9] {COFFEE,
## SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [10] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.009999614 0.8900344 25.16679 259
## lhs rhs support
## [1] {SET 3 RETROSPOT TEA,SUGAR} => {COFFEE} 0.01436238
## [2] {COFFEE,SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238
## [3] {COFFEE,SUGAR} => {SET 3 RETROSPOT TEA} 0.01436238
## confidence lift count
## [1] 1 55.94168 372
## [2] 1 69.62634 372
## [3] 1 69.62634 372
## lhs rhs support confidence lift count
## [1] {REGENCY TEA PLATE PINK} => {REGENCY TEA PLATE GREEN} 0.009034400 0.8863636 71.29722 234
## [2] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [3] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [4] {SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [5] {SET 3 RETROSPOT TEA} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [6] {COFFEE} => {SET 3 RETROSPOT TEA} 0.014362380 0.8034557 55.94168 372
## [7] {SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [8] {COFFEE} => {SUGAR} 0.014362380 0.8034557 55.94168 372
## [9] {REGENCY TEA PLATE GREEN} => {REGENCY TEA PLATE ROSES} 0.010347091 0.8322981 55.99313 268
## [10] {SHED} => {KEY FOB} 0.011273696 1.0000000 61.23168 292
## [11] {SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.012084476 0.8087855 44.38211 313
## [12] {SET/20 RED RETROSPOT PAPER NAPKINS,
## SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.009111617 0.8872180 48.68609 236
## [13] {PINK REGENCY TEACUP AND SAUCER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.018339060 0.8828996 24.96505 475
## [14] {GREEN REGENCY TEACUP AND SAUCER,
## PINK REGENCY TEACUP AND SAUCER} => {ROSES REGENCY TEACUP AND SAUCER} 0.018339060 0.8512545 22.59051 475
## [15] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.011235087 0.8584071 22.78033 291
## [16] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {GREEN REGENCY TEACUP AND SAUCER} 0.011312305 0.8643068 24.43931 293
## [17] {STRAWBERRY CHARLOTTE BAG,
## WOODLAND CHARLOTTE BAG} => {RED RETROSPOT CHARLOTTE BAG} 0.010771785 0.8110465 23.65644 279
## [18] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.009999614 0.8900344 25.16679 259
## [19] {GREEN REGENCY TEACUP AND SAUCER,
## PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {ROSES REGENCY TEACUP AND SAUCER} 0.009999614 0.8839590 23.45843 259
# Mine rules whose right-hand side is SUGAR; all remaining items are
# restricted to the left-hand side through `default = "lhs"`.
sugar_rules <- apriori(
  basket_data,
  parameter = list(support = 0.009, confidence = 0.8),
  appearance = list(rhs = "SUGAR", default = "lhs")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.009 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 233
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[10085 item(s), 25901 transaction(s)] done [1.23s].
## sorting and recoding items ... [508 item(s)] done [0.03s].
## creating transaction tree ... done [0.11s].
## checking subsets of size 1 2 3 4 done [0.09s].
## writing ... [3 rule(s)] done [0.00s].
## creating S4 object ... done [0.03s].
## lhs rhs support confidence lift
## [1] {SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238 1.0000000 69.62634
## [2] {COFFEE,SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238 1.0000000 69.62634
## [3] {COFFEE} => {SUGAR} 0.01436238 0.8034557 55.94168
## count
## [1] 372
## [2] 372
## [3] 372
# Mine rules whose left-hand side is SUGAR; all remaining items are
# restricted to the right-hand side through `default = "rhs"`.
# Note: this assignment replaces any previous value of `sugar_rules`.
sugar_rules <- apriori(
  basket_data,
  parameter = list(support = 0.009, confidence = 0.8),
  appearance = list(lhs = "SUGAR", default = "rhs")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.009 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 233
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[10085 item(s), 25901 transaction(s)] done [1.37s].
## sorting and recoding items ... [508 item(s)] done [0.03s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 done [0.02s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object ... done [0.03s].
## lhs rhs support confidence lift count
## [1] {SUGAR} => {SET 3 RETROSPOT TEA} 0.01436238 1 69.62634 372
## [2] {SUGAR} => {COFFEE} 0.01436238 1 55.94168 372
# Rank the mined rules by support (highest first) and print the top ten.
# head() replaces the hard-coded `[1:10]` so the selection stays valid when
# fewer than ten rules are available.
supp_rules <- sort(rules, by = "support", decreasing = TRUE)
top_rules <- head(supp_rules, n = 10)
inspect(top_rules)
## lhs rhs support confidence lift count
## [1] {PINK REGENCY TEACUP AND SAUCER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.01833906 0.8828996 24.96505 475
## [2] {GREEN REGENCY TEACUP AND SAUCER,
## PINK REGENCY TEACUP AND SAUCER} => {ROSES REGENCY TEACUP AND SAUCER} 0.01833906 0.8512545 22.59051 475
## [3] {SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238 1.0000000 69.62634 372
## [4] {SUGAR} => {SET 3 RETROSPOT TEA} 0.01436238 1.0000000 69.62634 372
## [5] {SET 3 RETROSPOT TEA} => {COFFEE} 0.01436238 1.0000000 55.94168 372
## [6] {COFFEE} => {SET 3 RETROSPOT TEA} 0.01436238 0.8034557 55.94168 372
## [7] {SUGAR} => {COFFEE} 0.01436238 1.0000000 55.94168 372
## [8] {COFFEE} => {SUGAR} 0.01436238 0.8034557 55.94168 372
## [9] {SET 3 RETROSPOT TEA,
## SUGAR} => {COFFEE} 0.01436238 1.0000000 55.94168 372
## [10] {COFFEE,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238 1.0000000 69.62634 372
# Rank the mined rules by confidence (highest first) and print the top ten.
# head() replaces the hard-coded `[1:10]` so the selection stays valid when
# fewer than ten rules are available.
conf_rules <- sort(rules, by = "confidence", decreasing = TRUE)
top_rules <- head(conf_rules, n = 10)
inspect(top_rules)
## lhs rhs support confidence lift count
## [1] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [2] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [3] {SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [4] {SET 3 RETROSPOT TEA} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [5] {SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [6] {SHED} => {KEY FOB} 0.011273696 1.0000000 61.23168 292
## [7] {SET 3 RETROSPOT TEA,
## SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [8] {COFFEE,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [9] {COFFEE,
## SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [10] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.009999614 0.8900344 25.16679 259
# Rank the mined rules by lift (highest first) and print the top ten.
# head() replaces the hard-coded `[1:10]` so the selection stays valid when
# fewer than ten rules are available.
lift_rules <- sort(rules, by = "lift", decreasing = TRUE)
top_rules <- head(lift_rules, n = 10)
inspect(top_rules)
## lhs rhs support confidence lift count
## [1] {REGENCY TEA PLATE PINK} => {REGENCY TEA PLATE GREEN} 0.009034400 0.8863636 71.29722 234
## [2] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [3] {SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [4] {COFFEE,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [5] {COFFEE,
## SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [6] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [7] {SHED} => {KEY FOB} 0.011273696 1.0000000 61.23168 292
## [8] {REGENCY TEA PLATE GREEN} => {REGENCY TEA PLATE ROSES} 0.010347091 0.8322981 55.99313 268
## [9] {SET 3 RETROSPOT TEA} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [10] {COFFEE} => {SET 3 RETROSPOT TEA} 0.014362380 0.8034557 55.94168 372