Agenda


  • detect patterns
  • count occurrence of patterns
  • split strings
  • replace strings
  • extract patterns
  • locate patterns

Libraries


library(stringr)
library(dplyr)
library(readr)

Data


# Read the full mock strings dataset (CSV) from the rsquaredacademy datasets
# repository on GitHub into a tibble.
mockstring <- read_csv('https://raw.githubusercontent.com/rsquaredacademy/datasets/master/mock_strings.csv')
## # A tibble: 1,000 x 12
##       id image_url  domain imageurl email filename phone address url      
##    <int> <chr>      <chr>  <chr>    <chr> <chr>    <chr> <chr>   <chr>    
##  1     1 https://r~ addto~ http://~ mnew~ PedeMal~ 66-(~ 8 Anha~ https://~
##  2     2 https://r~ gmpg.~ http://~ mdan~ Loborti~ 351-~ 697 Ea~ http://d~
##  3     3 https://r~ samsu~ http://~ hgir~ CongueD~ 33-(~ 89 Dot~ https://~
##  4     4 https://r~ spoti~ http://~ pmcm~ Eleifen~ 86-(~ 98135 ~ http://i~
##  5     5 https://r~ wunde~ http://~ dris~ PurusPh~ 223-~ 7814 P~ https://~
##  6     6 https://r~ alexa~ http://~ cphl~ Element~ 420-~ 4897 L~ https://~
##  7     7 https://r~ googl~ http://~ kdod~ Mattis.~ 1-(7~ 53541 ~ http://v~
##  8     8 https://r~ ed.gov http://~ vhou~ PurusEu~ 62-(~ 4819 H~ https://~
##  9     9 https://r~ jigsy~ http://~ rdik~ JustoEt~ 1-(6~ 68096 ~ https://~
## 10    10 https://r~ jugem~ http://~ tdud~ Ante.ti~ 30-(~ 9595 S~ https://~
## # ... with 990 more rows, and 3 more variables: full_name <chr>,
## #   currency <chr>, passwords <chr>

Case Study


  • extract domain name from random email ids
  • extract image type from url
  • extract image dimension from url
  • extract extension from domain name
  • extract http protocol from url
  • extract domain name from url
  • extract extension from url
  • extract file type from url

Sample Data


# Print the sample data used in the examples that follow.
# NOTE(review): the construction of mock_data is not shown here; presumably it
# is a 10-row subset (email, address, full_name, currency) of the full dataset.
mock_data
## # A tibble: 10 x 4
##    email                        address            full_name      currency
##    <chr>                        <chr>              <chr>          <chr>   
##  1 mnewburn0@fastcompany.com    8 Anhalt Crossing  Mufi Ruit      ¥34.37  
##  2 mdankersley1@digg.com        697 East Avenue    Leese Furmagi~ $67.37  
##  3 hgirhard2@altervista.org     89 Dottie Circle   Blakelee Wils~ €33,85  
##  4 pmcmenamy3@sciencedirect.com 98135 Blue Bill P~ Terencio McIl~ €42,89  
##  5 drisbrough4@bandcamp.com     7814 Pennsylvania~ Debee McErlai~ €13,19  
##  6 cphlippi5@surveymonkey.com   4897 Little Fleur~ Fran Painten   ¥87.35  
##  7 kdodswell6@un.org            53541 Morrow Cent~ Frasco Bowich  $34.89  
##  8 vhourihane7@ovh.net          4819 Hermina Park~ Car Ponten     ¥41.66  
##  9 rdike8@timesonline.co.uk     68096 Monument Pa~ Tades Checcuc~ €70,80  
## 10 tdudbridge9@clickbank.net    9595 Spaight Aven~ Wilton Kemmey  €62,76

Detect @




Detect @


# TRUE for each email that contains "@", FALSE otherwise.
str_detect(mock_data$email, pattern = "@")
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Count @




Count @


# Number of times "@" occurs in each email.
str_count(mock_data$email, pattern = "@")
##  [1] 1 1 1 1 1 1 1 1 1 1

Concatenate




Concatenate


# Prefix every email with "email id:"; str_c() inserts no separator by default,
# so the prefix is joined directly to the address.
str_c("email id:", mock_data$email)
##  [1] "email id:mnewburn0@fastcompany.com"   
##  [2] "email id:mdankersley1@digg.com"       
##  [3] "email id:hgirhard2@altervista.org"    
##  [4] "email id:pmcmenamy3@sciencedirect.com"
##  [5] "email id:drisbrough4@bandcamp.com"    
##  [6] "email id:cphlippi5@surveymonkey.com"  
##  [7] "email id:kdodswell6@un.org"           
##  [8] "email id:vhourihane7@ovh.net"         
##  [9] "email id:rdike8@timesonline.co.uk"    
## [10] "email id:tdudbridge9@clickbank.net"

Split




Split


# Split each email at "@"; the result is a list with one character vector
# (user name, domain) per email.
str_split(mock_data$email, pattern = "@")
## [[1]]
## [1] "mnewburn0"       "fastcompany.com"
## 
## [[2]]
## [1] "mdankersley1" "digg.com"    
## 
## [[3]]
## [1] "hgirhard2"      "altervista.org"
## 
## [[4]]
## [1] "pmcmenamy3"        "sciencedirect.com"
## 
## [[5]]
## [1] "drisbrough4"  "bandcamp.com"
## 
## [[6]]
## [1] "cphlippi5"        "surveymonkey.com"
## 
## [[7]]
## [1] "kdodswell6" "un.org"    
## 
## [[8]]
## [1] "vhourihane7" "ovh.net"    
## 
## [[9]]
## [1] "rdike8"            "timesonline.co.uk"
## 
## [[10]]
## [1] "tdudbridge9"   "clickbank.net"

Sort




Sort


# Sort the emails alphabetically (ascending order is the default).
str_sort(mock_data$email)
##  [1] "cphlippi5@surveymonkey.com"   "drisbrough4@bandcamp.com"    
##  [3] "hgirhard2@altervista.org"     "kdodswell6@un.org"           
##  [5] "mdankersley1@digg.com"        "mnewburn0@fastcompany.com"   
##  [7] "pmcmenamy3@sciencedirect.com" "rdike8@timesonline.co.uk"    
##  [9] "tdudbridge9@clickbank.net"    "vhourihane7@ovh.net"

Sort




Sort


# Sort the emails in reverse alphabetical order. The argument that controls
# the direction is `decreasing`; `descending` is not a str_sort() parameter
# and leaves the result in ascending order.
str_sort(mock_data$email, decreasing = TRUE)
##  [1] "vhourihane7@ovh.net"          "tdudbridge9@clickbank.net"   
##  [3] "rdike8@timesonline.co.uk"     "pmcmenamy3@sciencedirect.com"
##  [5] "mnewburn0@fastcompany.com"    "mdankersley1@digg.com"       
##  [7] "kdodswell6@un.org"            "hgirhard2@altervista.org"    
##  [9] "drisbrough4@bandcamp.com"     "cphlippi5@surveymonkey.com"

Case




Case


# Convert every full name to upper case.
str_to_upper(mock_data$full_name)
##  [1] "MUFI RUIT"          "LEESE FURMAGIER"    "BLAKELEE WILSHIRE" 
##  [4] "TERENCIO MCILLRICK" "DEBEE MCERLAINE"    "FRAN PAINTEN"      
##  [7] "FRASCO BOWICH"      "CAR PONTEN"         "TADES CHECCUCCI"   
## [10] "WILTON KEMMEY"

Replace




Replace


# Replace the first "Street" in each address with "ST"; addresses that do not
# contain "Street" are returned unchanged.
str_replace(mock_data$address, "Street", "ST")
##  [1] "8 Anhalt Crossing"          "697 East Avenue"           
##  [3] "89 Dottie Circle"           "98135 Blue Bill Park Drive"
##  [5] "7814 Pennsylvania ST"       "4897 Little Fleur Drive"   
##  [7] "53541 Morrow Center"        "4819 Hermina Parkway"      
##  [9] "68096 Monument Park"        "9595 Spaight Avenue"

Extract




Extract


# Extract the first "com" found in each email; NA where there is no match.
str_extract(mock_data$email, pattern = "com")
##  [1] "com" "com" NA    "com" "com" "com" NA    NA    NA    NA

Match




Match


# Return the first "com" match for each email as a character matrix with one
# row per email; NA where there is no match.
str_match(mock_data$email, pattern = "com")
##       [,1] 
##  [1,] "com"
##  [2,] "com"
##  [3,] NA   
##  [4,] "com"
##  [5,] "com"
##  [6,] "com"
##  [7,] NA   
##  [8,] NA   
##  [9,] NA   
## [10,] NA

Index




Index


# Positions (indices) of the emails that contain "com".
str_which(mock_data$email, pattern = "com")
## [1] 1 2 4 5 6

Locate




Locate


# Start and end character positions of the first "com" in each email (NA when
# absent). The match is not anchored to the end of the address: in row 1 it is
# the "com" inside "fastcompany", not the ".com" suffix.
str_locate(mock_data$email, pattern = "com")
##       start end
##  [1,]    15  17
##  [2,]    19  21
##  [3,]    NA  NA
##  [4,]    26  28
##  [5,]    22  24
##  [6,]    24  26
##  [7,]    NA  NA
##  [8,]    NA  NA
##  [9,]    NA  NA
## [10,]    NA  NA

Length




Extract




Extract


# Take the first character of each currency value, i.e. the currency symbol.
str_sub(mock_data$currency, start = 1, end = 1)
##  [1] "¥" "$" "€" "€" "€" "¥" "$" "¥" "€" "€"

Extract Word




Word


# Extract the first word of each full name.
word(mock_data$full_name, 1)
##  [1] "Mufi"     "Leese"    "Blakelee" "Terencio" "Debee"    "Fran"    
##  [7] "Frasco"   "Car"      "Tades"    "Wilton"