library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
divvy_2004 <- read.csv('202004-divvy-tripdata.csv')
divvy_2005 <- read.csv('202005-divvy-tripdata.csv')
divvy_2006 <- read.csv('202006-divvy-tripdata.csv')
divvy_2007 <- read.csv('202007-divvy-tripdata.csv')
divvy_2008 <- read.csv('202008-divvy-tripdata.csv')
divvy_2009 <- read.csv('202009-divvy-tripdata.csv')
divvy_2010 <- read.csv('202010-divvy-tripdata.csv')
divvy_2011 <- read.csv('202011-divvy-tripdata.csv')
divvy_2012 <- read.csv('202012-divvy-tripdata.csv')
divvy_2101 <- read.csv('202101-divvy-tripdata.csv')
divvy_2102 <- read.csv('202102-divvy-tripdata.csv')
divvy_2103 <- read.csv('202103-divvy-tripdata.csv')
Before merging individual files into one integrated file, I should take a look at each of the data tables and check the column names and the data format of each column. The data format in each column of every month must match; if it differs across the data tables, integration becomes impossible.
colnames(divvy_2007)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
str(divvy_2007)
## 'data.frame': 551480 obs. of 13 variables:
## $ ride_id : chr "762198876D69004D" "BEC9C9FBA0D4CF1B" "D2FD8EA432C77EC1" "54AE594E20B35881" ...
## $ rideable_type : chr "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : chr "2020-07-09 15:22:02" "2020-07-24 23:56:30" "2020-07-08 19:49:07" "2020-07-17 19:06:42" ...
## $ ended_at : chr "2020-07-09 15:25:52" "2020-07-25 00:20:17" "2020-07-08 19:56:22" "2020-07-17 19:27:38" ...
## $ start_station_name: chr "Ritchie Ct & Banks St" "Halsted St & Roscoe St" "Lake Shore Dr & Diversey Pkwy" "LaSalle St & Illinois St" ...
## $ start_station_id : int 180 299 329 181 268 635 113 211 176 31 ...
## $ end_station_name : chr "Wells St & Evergreen Ave" "Broadway & Ridge Ave" "Clark St & Wellington Ave" "Clark St & Armitage Ave" ...
## $ end_station_id : int 291 461 156 94 301 289 140 31 191 142 ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.9 42 41.9 41.9 41.9 ...
## $ end_lng : num -87.6 -87.7 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "casual" "casual" ...
This command (compare_df_cols) let us know if columns will successfully bind together by rows:
compare_df_cols(divvy_2004,divvy_2005,divvy_2006,divvy_2007,divvy_2008,divvy_2009,divvy_2010,divvy_2011,divvy_2012,divvy_2101,divvy_2102,divvy_2103, return = "mismatch")
## column_name divvy_2004 divvy_2005 divvy_2006 divvy_2007 divvy_2008
## 1 end_station_id integer integer integer integer integer
## 2 start_station_id integer integer integer integer integer
## divvy_2009 divvy_2010 divvy_2011 divvy_2012 divvy_2101 divvy_2102 divvy_2103
## 1 integer integer integer character character character character
## 2 integer integer integer character character character character
Using the command (compare_df_cols) that matches columns of datasets and returns the mismatched columns, we can see the column end station id and column start_station_id have different data formats in several datasets. From December 2020 to March 2021, it is a character, and before December, it is an integer. Integration is possible only when those are changed to the same data type.
If the data format is different, integration into one file is impossible, and the data format is unmatched. The ideal way is to unify all columns into same data format, the target columns are the end_station_id and start_station_id columns from April to November to be its type changed into character.
divvy_2004 <- mutate(divvy_2004, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2005 <- mutate(divvy_2005, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2006 <- mutate(divvy_2006, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2007 <- mutate(divvy_2007, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2008 <- mutate(divvy_2008, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2009 <- mutate(divvy_2009, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2010 <- mutate(divvy_2010, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
divvy_2011 <- mutate(divvy_2011, end_station_id = as.character(end_station_id),
start_station_id = as.character(start_station_id))
Finally, lets integrate several tables into one integrated file.
divvy_2004_2103 <- bind_rows(divvy_2004,divvy_2005,divvy_2006,divvy_2007,divvy_2008,divvy_2009,divvy_2010,divvy_2011,divvy_2012,divvy_2101,divvy_2102,divvy_2103)
Let’s see if it is well-integrated. The results will show a sample of top rows of the table file, as well checking if all columns are in the correct data format.
str(divvy_2004_2103)
## 'data.frame': 3489748 obs. of 13 variables:
## $ ride_id : chr "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
## $ rideable_type : chr "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : chr "2020-04-26 17:45:14" "2020-04-17 17:08:54" "2020-04-01 17:54:13" "2020-04-07 12:50:19" ...
## $ ended_at : chr "2020-04-26 18:12:03" "2020-04-17 17:17:03" "2020-04-01 18:08:36" "2020-04-07 13:02:31" ...
## $ start_station_name: chr "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
## $ start_station_id : chr "86" "503" "142" "216" ...
## $ end_station_name : chr "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
## $ end_station_id : chr "152" "499" "255" "657" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 42 ...
## $ end_lng : num -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
We notice in table above that started_at and ended_at columns both identify as character, and We will analyze each bicycle trip based on time. Therefore, it is impossible to do any analyze to the data format and it should be changed from character to date-time format. We use strptime function for Date-time Conversion from Character.
divvy_2004_2103$started_at = strptime(divvy_2004_2103$started_at, format = "%Y-%m-%d %H:%M:%S")
divvy_2004_2103$ended_at = strptime(divvy_2004_2103$ended_at, format = "%Y-%m-%d %H:%M:%S")
str(divvy_2004_2103)
## 'data.frame': 3489748 obs. of 13 variables:
## $ ride_id : chr "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
## $ rideable_type : chr "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : POSIXlt, format: "2020-04-26 17:45:14" "2020-04-17 17:08:54" ...
## $ ended_at : POSIXlt, format: "2020-04-26 18:12:03" "2020-04-17 17:17:03" ...
## $ start_station_name: chr "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
## $ start_station_id : chr "86" "503" "142" "216" ...
## $ end_station_name : chr "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
## $ end_station_id : chr "152" "499" "255" "657" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 42 ...
## $ end_lng : num -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
summary(divvy_2004_2103)
## ride_id rideable_type started_at
## Length:3489748 Length:3489748 Min. :2020-04-01 00:00:30.00
## Class :character Class :character 1st Qu.:2020-07-14 19:38:28.00
## Mode :character Mode :character Median :2020-08-29 14:50:36.50
## Mean :2020-09-10 01:21:45.98
## 3rd Qu.:2020-10-20 18:14:13.00
## Max. :2021-03-31 23:59:08.00
##
## ended_at start_station_name start_station_id
## Min. :2020-04-01 00:10:45.00 Length:3489748 Length:3489748
## 1st Qu.:2020-07-14 20:13:07.75 Class :character Class :character
## Median :2020-08-29 15:21:13.00 Mode :character Mode :character
## Mean :2020-09-10 01:46:31.98
## 3rd Qu.:2020-10-20 18:28:46.25
## Max. :2021-04-06 11:00:11.00
##
## end_station_name end_station_id start_lat start_lng
## Length:3489748 Length:3489748 Min. :41.64 Min. :-87.87
## Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
## Mode :character Mode :character Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.08 Max. :-87.52
##
## end_lat end_lng member_casual
## Min. :41.54 Min. :-88.07 Length:3489748
## 1st Qu.:41.88 1st Qu.:-87.66 Class :character
## Median :41.90 Median :-87.64 Mode :character
## Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.16 Max. :-87.44
## NA's :4738 NA's :4738
Now, for analysis by month, date, and day of the week, extract separately from the started_at column. We first extract dates only from the started_at column to create new date columns, and then extract months, day, and day of the week.
divvy_2004_2103$date <- as.Date(divvy_2004_2103$started_at)
divvy_2004_2103$year <- format(as.Date(divvy_2004_2103$started_at), "%Y")
divvy_2004_2103$month <- format(as.Date(divvy_2004_2103$started_at), "%m")
divvy_2004_2103$day <- format(as.Date(divvy_2004_2103$started_at), "%d")
divvy_2004_2103$day_of_week <- format(as.Date(divvy_2004_2103$started_at), "%A")
Then we find the length of each ride by Subtract the value of ended_at from the value of started_at, and create a ride_length column.
divvy_2004_2103$ride_length <- difftime(divvy_2004_2103$ended_at,divvy_2004_2103$started_at)
We need to change the format to numeric to analyze and calculate.
divvy_2004_2103$ride_length <- as.numeric(divvy_2004_2103$ride_length)
After taking a look into ride_length values, I found some values are less than zero, so I decide to remove it from the data for a better analyze
divvy_2004_2103_clean <- divvy_2004_2103[!(divvy_2004_2103$ride_length<0),]
It is almost over now. It is a final checking before extracting the organized and cleaned integrated file. NA values can not be deleted from R but it will simply excluded from Tableau visualization.
summary(divvy_2004_2103_clean)
## ride_id rideable_type started_at
## Length:3479196 Length:3479196 Min. :2020-04-01 00:00:30.00
## Class :character Class :character 1st Qu.:2020-07-14 18:48:36.50
## Mode :character Mode :character Median :2020-08-29 14:36:43.50
## Mean :2020-09-10 01:41:08.00
## 3rd Qu.:2020-10-20 20:26:31.75
## Max. :2021-03-31 23:59:08.00
##
## ended_at start_station_name start_station_id
## Min. :2020-04-01 00:10:45.00 Length:3479196 Length:3479196
## 1st Qu.:2020-07-14 19:20:26.00 Class :character Class :character
## Median :2020-08-29 15:09:12.00 Mode :character Mode :character
## Mean :2020-09-10 02:09:05.05
## 3rd Qu.:2020-10-20 20:42:57.50
## Max. :2021-04-06 11:00:11.00
##
## end_station_name end_station_id start_lat start_lng
## Length:3479196 Length:3479196 Min. :41.64 Min. :-87.87
## Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
## Mode :character Mode :character Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.08 Max. :-87.52
##
## end_lat end_lng member_casual date
## Min. :41.54 Min. :-88.07 Length:3479196 Min. :2020-04-01
## 1st Qu.:41.88 1st Qu.:-87.66 Class :character 1st Qu.:2020-07-14
## Median :41.90 Median :-87.64 Mode :character Median :2020-08-29
## Mean :41.90 Mean :-87.64 Mean :2020-09-09
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:2020-10-20
## Max. :42.16 Max. :-87.44 Max. :2021-03-31
## NA's :4712 NA's :4712
## year month day day_of_week
## Length:3479196 Length:3479196 Length:3479196 Length:3479196
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ride_length
## Min. : 0
## 1st Qu.: 476
## Median : 874
## Mean : 1677
## 3rd Qu.: 1601
## Max. :3523202
##
mean_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = mean)
colnames(mean_agr) <- c("Customer Type", "Mean Value")
median_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = median)
colnames(median_agr) <- c("Customer Type", "Median Value")
max_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = max)
colnames(max_agr) <- c("Customer Type", "Max Value")
min_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = min)
colnames(min_agr) <- c("Customer Type", "Min Value")
print(mean_agr)
## Customer Type Mean Value
## 1 casual 2698.1722
## 2 member 966.9067
print(median_agr)
## Customer Type Median Value
## 1 casual 1272
## 2 member 689
print(max_agr)
## Customer Type Max Value
## 1 casual 3341033
## 2 member 3523202
print(min_agr)
## Customer Type Min Value
## 1 casual 0
## 2 member 0
divvy_2004_2103_clean$day_of_week <- ordered(divvy_2004_2103_clean$day_of_week, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual + divvy_2004_2103_clean$day_of_week, FUN = mean)
## divvy_2004_2103_clean$member_casual divvy_2004_2103_clean$day_of_week
## 1 casual Monday
## 2 member Monday
## 3 casual Tuesday
## 4 member Tuesday
## 5 casual Wednesday
## 6 member Wednesday
## 7 casual Thursday
## 8 member Thursday
## 9 casual Friday
## 10 member Friday
## 11 casual Saturday
## 12 member Saturday
## 13 casual Sunday
## 14 member Sunday
## divvy_2004_2103_clean$ride_length
## 1 2698.5630
## 2 920.2266
## 3 2429.2405
## 4 907.8393
## 5 2418.8408
## 6 918.8948
## 7 2578.1268
## 8 912.5715
## 9 2566.0303
## 10 948.1130
## 11 2817.3583
## 12 1067.6161
## 13 3044.5856
## 14 1093.0548
Finally, the cleaning process done! Lets extract the file, and then analyze and share the data in Tableau!
write.csv(divvy_2004_2103_clean, "divvy_2004_2103.csv")