Agenda


Create some of the most routinely used plots to explore data using the geom_* functions:

  • Scatter Plot
  • Bar Plot
  • Box Plot
  • Histogram
  • Line Chart
  • Regression Line

Libraries


library(ggplot2)
library(readr)

Data


# Load the e-commerce web-traffic sample that the later bar, histogram and
# jitter examples plot.
ecom <- read_csv(
  file = "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/web.csv"
)
## # A tibble: 1,000 x 11
##       id referrer device bouncers n_visit n_pages duration        country
##    <int>    <chr>  <chr>    <chr>   <int>   <dbl>    <dbl>          <chr>
##  1     1   google laptop     true      10       1      693 Czech Republic
##  2     2    yahoo tablet     true       9       1      459          Yemen
##  3     3   direct laptop     true       0       1      996         Brazil
##  4     4     bing tablet    false       3      18      468          China
##  5     5    yahoo mobile     true       9       1      955         Poland
##  6     6    yahoo laptop    false       5       5      135   South Africa
##  7     7    yahoo mobile     true      10       1       75     Bangladesh
##  8     8   direct mobile     true      10       1      908      Indonesia
##  9     9     bing mobile    false       3      19      209    Netherlands
## 10    10   google mobile     true       6       1      208 Czech Republic
## # ... with 990 more rows, and 3 more variables: purchase <chr>,
## #   order_items <dbl>, order_value <dbl>

Data Dictionary


  • id: row id
  • referrer: referrer website/search engine
  • os: operating system (not among the columns printed above)
  • browser: browser (not among the columns printed above)
  • device: device used to visit the website
  • bouncers: whether the visit bounced (true/false)
  • n_visit: frequency of visits
  • n_pages: number of pages visited
  • duration: time spent on the website (in seconds)
  • country: country of origin
  • purchase: whether visitor purchased
  • order_items: number of items in the order
  • order_value: order value of visitor (in dollars)

Point


# Scatter plot: one point per mtcars row, disp on x and mpg on y.
ggplot(data = mtcars, mapping = aes(x = disp, y = mpg)) +
  geom_point()

Regression Line


  • geom_abline()
  • geom_smooth()

Regression Line


# Scatter plot of mpg against wt, overlaid with a straight line drawn from
# an explicitly supplied intercept and slope.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_abline(slope = -5.344, intercept = 37.285)

Regression Line


# Smoothed line only (no points layer): method "lm", with the interval
# band requested through se = TRUE.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_smooth(se = TRUE, method = "lm")

Regression Line


# Smoothed line only: method "loess", with the interval band turned off
# through se = FALSE.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_smooth(se = FALSE, method = "loess")

Horizontal/Vertical Lines


  • geom_hline()
  • geom_vline()

Horizontal Line


# Scatter plot of mpg against wt with a horizontal reference line at
# y = 30.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_hline(yintercept = 30)

Vertical Line


# Scatter plot of mpg against wt with a vertical reference line at x = 5.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_vline(xintercept = 5)

Bar Plot


Frequency


# Frequency bar plot: one bar per device, device treated as a factor.
ggplot(data = ecom, mapping = aes(x = factor(device))) +
  geom_bar()

Weight


# Weighted bar plot: bars per device, with order_value supplied as the
# weight aesthetic on the bar layer.
ggplot(data = ecom, mapping = aes(x = factor(device))) +
  geom_bar(mapping = aes(weight = order_value))

Columns


# Pre-summarised visits per device, drawn as blue columns whose heights
# are taken directly from the `visits` values.
device <- c("laptop", "mobile", "tablet")
visits <- c(30000, 12000, 5000)
traffic <- tibble::tibble(device = device, visits = visits)
ggplot(data = traffic, mapping = aes(x = device, y = visits)) +
  geom_col(fill = "blue")

Boxplot


Data


# Load the long-format stock returns (one row per stock/return pair) used
# by the box plot example.
tidy_returns <- read_csv(
  file = "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/tidy_tickers.csv"
)
## # A tibble: 1,254 x 2
##    stock   returns
##    <chr>     <dbl>
##  1  AAPL  1.377845
##  2  AAPL  2.834412
##  3  AAPL -0.039360
##  4  AAPL  0.108261
##  5  AAPL  1.643570
##  6  AAPL  0.068894
##  7  AAPL -0.560975
##  8  AAPL  0.551140
##  9  AAPL -0.216522
## 10  AAPL -0.108253
## # ... with 1,244 more rows

Box Plot


# Box plot of returns, one box per stock (stock treated as a factor).
ggplot(
  data = tidy_returns,
  mapping = aes(x = factor(stock), y = returns)
) +
  geom_boxplot()

Add Jitter


# Box plot of n_pages per device with the raw observations jittered on top.
# outlier.shape = NA hides the box plot's own outlier markers: geom_jitter()
# already draws every observation, so each outlier would otherwise appear
# twice (once at its true x position and once jittered).
ggplot(ecom, aes(x = factor(device), y = n_pages)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter()

Histogram


Histogram


# Histogram of duration with no bins/binwidth supplied, so stat_bin()
# uses its default and reports it in the message shown below.
ggplot(data = ecom, mapping = aes(x = duration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bins


# Histogram of duration with the number of bins set explicitly to 5.
ggplot(data = ecom, mapping = aes(x = duration)) +
  geom_histogram(bins = 5)

Line


Data


# Load daily user counts per device type. Column types are supplied
# positionally: a date parsed as month/day/two-digit-year, followed by
# three integer columns.
users <- read_csv(
  file = "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/users_device.csv",
  col_types = list(
    col_date(format = "%m/%d/%y"),
    col_integer(),
    col_integer(),
    col_integer()
  )
)
## # A tibble: 33 x 4
##          Date Desktop Mobile Tablet
##        <date>   <int>  <int>  <int>
##  1 2017-10-11    2345    876     92
##  2 2017-10-12    2173    784    111
##  3 2017-10-13    1826    772     97
##  4 2017-10-14    1178   1032    155
##  5 2017-10-15    1239    747    105
##  6 2017-10-16    2158    801     85
##  7 2017-10-17    2682   1307    127
##  8 2017-10-18    2252   1110    112
##  9 2017-10-19    2210    891     93
## 10 2017-10-20    2040    824     94
## # ... with 23 more rows

Line Chart


# Line chart of Desktop users over Date.
ggplot(data = users, mapping = aes(x = Date, y = Desktop)) +
  geom_line()

Jitter


# Jittered points of duration per device, using geom_jitter()'s default
# jitter amounts.
ggplot(data = ecom, mapping = aes(x = factor(device), y = duration)) +
  geom_jitter()

Jitter Width & Height


# Jittered points of duration per device with the horizontal and vertical
# jitter amounts set explicitly.
ggplot(data = ecom, mapping = aes(x = factor(device), y = duration)) +
  geom_jitter(height = 0.5, width = 0.25)

Label


# Draw each mtcars row as a boxed label showing its row name, positioned
# at (disp, mpg).
ggplot(
  data = mtcars,
  mapping = aes(x = disp, y = mpg, label = rownames(mtcars))
) +
  geom_label()

Text


# Draw each mtcars row name as small plain text at (disp, mpg);
# check_overlap = TRUE is passed so overlapping labels are not all drawn.
ggplot(
  data = mtcars,
  mapping = aes(x = disp, y = mpg, label = rownames(mtcars))
) +
  geom_text(size = 2, check_overlap = TRUE)

Text


# Scatter plot of mpg against disp with each point annotated by its row
# name. Labels are left-aligned (hjust = 0), nudged slightly along x,
# drawn small and rotated 45 degrees.
# cyl is wrapped in factor() so the label colour uses a discrete scale with
# one legend key per cylinder count; mapping the raw numeric column would
# produce a continuous gradient.
ggplot(mtcars, aes(x = disp, y = mpg, label = rownames(mtcars))) +
  geom_point() +
  geom_text(
    aes(color = factor(cyl)),
    hjust = 0, nudge_x = 0.05,
    size = 2, angle = 45
  )