Call the needed libraries and import datasets

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)

divvy_2004 <- read.csv('202004-divvy-tripdata.csv')
divvy_2005 <- read.csv('202005-divvy-tripdata.csv')
divvy_2006 <- read.csv('202006-divvy-tripdata.csv')
divvy_2007 <- read.csv('202007-divvy-tripdata.csv')
divvy_2008 <- read.csv('202008-divvy-tripdata.csv')
divvy_2009 <- read.csv('202009-divvy-tripdata.csv')
divvy_2010 <- read.csv('202010-divvy-tripdata.csv')
divvy_2011 <- read.csv('202011-divvy-tripdata.csv')
divvy_2012 <- read.csv('202012-divvy-tripdata.csv')
divvy_2101 <- read.csv('202101-divvy-tripdata.csv')
divvy_2102 <- read.csv('202102-divvy-tripdata.csv')
divvy_2103 <- read.csv('202103-divvy-tripdata.csv')

Taking a look at the data table

Before merging individual files into one integrated file, I should take a look at each of the data tables and check the column names and the data format of each column. The data format in each column of every month must match; if it differs across the data tables, integration becomes impossible.

colnames(divvy_2007)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

str(divvy_2007)

## 'data.frame':    551480 obs. of  13 variables:
##  $ ride_id           : chr  "762198876D69004D" "BEC9C9FBA0D4CF1B" "D2FD8EA432C77EC1" "54AE594E20B35881" ...
##  $ rideable_type     : chr  "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : chr  "2020-07-09 15:22:02" "2020-07-24 23:56:30" "2020-07-08 19:49:07" "2020-07-17 19:06:42" ...
##  $ ended_at          : chr  "2020-07-09 15:25:52" "2020-07-25 00:20:17" "2020-07-08 19:56:22" "2020-07-17 19:27:38" ...
##  $ start_station_name: chr  "Ritchie Ct & Banks St" "Halsted St & Roscoe St" "Lake Shore Dr & Diversey Pkwy" "LaSalle St & Illinois St" ...
##  $ start_station_id  : int  180 299 329 181 268 635 113 211 176 31 ...
##  $ end_station_name  : chr  "Wells St & Evergreen Ave" "Broadway & Ridge Ave" "Clark St & Wellington Ave" "Clark St & Armitage Ave" ...
##  $ end_station_id    : int  291 461 156 94 301 289 140 31 191 142 ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  41.9 42 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.6 -87.7 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr  "member" "member" "casual" "casual" ...

Compare the datasets

This command (compare_df_cols) let us know if columns will successfully bind together by rows:

compare_df_cols(divvy_2004,divvy_2005,divvy_2006,divvy_2007,divvy_2008,divvy_2009,divvy_2010,divvy_2011,divvy_2012,divvy_2101,divvy_2102,divvy_2103, return = "mismatch")

##        column_name divvy_2004 divvy_2005 divvy_2006 divvy_2007 divvy_2008
## 1   end_station_id    integer    integer    integer    integer    integer
## 2 start_station_id    integer    integer    integer    integer    integer
##   divvy_2009 divvy_2010 divvy_2011 divvy_2012 divvy_2101 divvy_2102 divvy_2103
## 1    integer    integer    integer  character  character  character  character
## 2    integer    integer    integer  character  character  character  character

Using the command (compare_df_cols) that matches columns of datasets and returns the mismatched columns, we can see the column end station id and column start_station_id have different data formats in several datasets. From December 2020 to March 2021, it is a character, and before December, it is an integer. Integration is possible only when those are changed to the same data type.

Unify the Column type

If the data format is different, integration into one file is impossible, and the data format is unmatched. The ideal way is to unify all columns into same data format, the target columns are the end_station_id and start_station_id columns from April to November to be its type changed into character.

divvy_2004 <- mutate(divvy_2004, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2005 <- mutate(divvy_2005, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2006 <- mutate(divvy_2006, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2007 <- mutate(divvy_2007, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2008 <- mutate(divvy_2008, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2009 <- mutate(divvy_2009, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2010 <- mutate(divvy_2010, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))
divvy_2011 <- mutate(divvy_2011, end_station_id = as.character(end_station_id),
                     start_station_id = as.character(start_station_id))

Combine tables into one integrated file

Finally, lets integrate several tables into one integrated file.

divvy_2004_2103 <- bind_rows(divvy_2004,divvy_2005,divvy_2006,divvy_2007,divvy_2008,divvy_2009,divvy_2010,divvy_2011,divvy_2012,divvy_2101,divvy_2102,divvy_2103)

Check the integrated file

Let’s see if it is well-integrated. The results will show a sample of top rows of the table file, as well checking if all columns are in the correct data format.

str(divvy_2004_2103)

## 'data.frame':    3489748 obs. of  13 variables:
##  $ ride_id           : chr  "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
##  $ rideable_type     : chr  "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : chr  "2020-04-26 17:45:14" "2020-04-17 17:08:54" "2020-04-01 17:54:13" "2020-04-07 12:50:19" ...
##  $ ended_at          : chr  "2020-04-26 18:12:03" "2020-04-17 17:17:03" "2020-04-01 18:08:36" "2020-04-07 13:02:31" ...
##  $ start_station_name: chr  "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
##  $ start_station_id  : chr  "86" "503" "142" "216" ...
##  $ end_station_name  : chr  "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
##  $ end_station_id    : chr  "152" "499" "255" "657" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 42 ...
##  $ end_lng           : num  -87.7 -87.7 -87.6 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

We notice in table above that started_at and ended_at columns both identify as character, and We will analyze each bicycle trip based on time. Therefore, it is impossible to do any analyze to the data format and it should be changed from character to date-time format. We use strptime function for Date-time Conversion from Character.

divvy_2004_2103$started_at = strptime(divvy_2004_2103$started_at, format = "%Y-%m-%d %H:%M:%S")
divvy_2004_2103$ended_at = strptime(divvy_2004_2103$ended_at, format = "%Y-%m-%d %H:%M:%S")

Check again if all columns are in the correct data format

str(divvy_2004_2103)

## 'data.frame':    3489748 obs. of  13 variables:
##  $ ride_id           : chr  "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
##  $ rideable_type     : chr  "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : POSIXlt, format: "2020-04-26 17:45:14" "2020-04-17 17:08:54" ...
##  $ ended_at          : POSIXlt, format: "2020-04-26 18:12:03" "2020-04-17 17:17:03" ...
##  $ start_station_name: chr  "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
##  $ start_station_id  : chr  "86" "503" "142" "216" ...
##  $ end_station_name  : chr  "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
##  $ end_station_id    : chr  "152" "499" "255" "657" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 42 ...
##  $ end_lng           : num  -87.7 -87.7 -87.6 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

summary(divvy_2004_2103)

##    ride_id          rideable_type        started_at                    
##  Length:3489748     Length:3489748     Min.   :2020-04-01 00:00:30.00  
##  Class :character   Class :character   1st Qu.:2020-07-14 19:38:28.00  
##  Mode  :character   Mode  :character   Median :2020-08-29 14:50:36.50  
##                                        Mean   :2020-09-10 01:21:45.98  
##                                        3rd Qu.:2020-10-20 18:14:13.00  
##                                        Max.   :2021-03-31 23:59:08.00  
##                                                                        
##     ended_at                      start_station_name start_station_id  
##  Min.   :2020-04-01 00:10:45.00   Length:3489748     Length:3489748    
##  1st Qu.:2020-07-14 20:13:07.75   Class :character   Class :character  
##  Median :2020-08-29 15:21:13.00   Mode  :character   Mode  :character  
##  Mean   :2020-09-10 01:46:31.98                                        
##  3rd Qu.:2020-10-20 18:28:46.25                                        
##  Max.   :2021-04-06 11:00:11.00                                        
##                                                                        
##  end_station_name   end_station_id       start_lat       start_lng     
##  Length:3489748     Length:3489748     Min.   :41.64   Min.   :-87.87  
##  Class :character   Class :character   1st Qu.:41.88   1st Qu.:-87.66  
##  Mode  :character   Mode  :character   Median :41.90   Median :-87.64  
##                                        Mean   :41.90   Mean   :-87.64  
##                                        3rd Qu.:41.93   3rd Qu.:-87.63  
##                                        Max.   :42.08   Max.   :-87.52  
##                                                                        
##     end_lat         end_lng       member_casual     
##  Min.   :41.54   Min.   :-88.07   Length:3489748    
##  1st Qu.:41.88   1st Qu.:-87.66   Class :character  
##  Median :41.90   Median :-87.64   Mode  :character  
##  Mean   :41.90   Mean   :-87.64                     
##  3rd Qu.:41.93   3rd Qu.:-87.63                     
##  Max.   :42.16   Max.   :-87.44                     
##  NA's   :4738    NA's   :4738

Create New columns such as date, year, month, day, day_of_Week and ride_length

Now, for analysis by month, date, and day of the week, extract separately from the started_at column. We first extract dates only from the started_at column to create new date columns, and then extract months, day, and day of the week.

divvy_2004_2103$date <- as.Date(divvy_2004_2103$started_at)
divvy_2004_2103$year <- format(as.Date(divvy_2004_2103$started_at), "%Y")
divvy_2004_2103$month <- format(as.Date(divvy_2004_2103$started_at), "%m")
divvy_2004_2103$day <- format(as.Date(divvy_2004_2103$started_at), "%d")
divvy_2004_2103$day_of_week <- format(as.Date(divvy_2004_2103$started_at), "%A")

Then we find the length of each ride by Subtract the value of ended_at from the value of started_at, and create a ride_length column.

divvy_2004_2103$ride_length <- difftime(divvy_2004_2103$ended_at,divvy_2004_2103$started_at)

Changing the data format of ride_length to ‘numeric’

We need to change the format to numeric to analyze and calculate.

divvy_2004_2103$ride_length <- as.numeric(divvy_2004_2103$ride_length)

Remove wrong values

After taking a look into ride_length values, I found some values are less than zero, so I decide to remove it from the data for a better analyze

 divvy_2004_2103_clean <- divvy_2004_2103[!(divvy_2004_2103$ride_length<0),]

Final inspection

It is almost over now. It is a final checking before extracting the organized and cleaned integrated file. NA values can not be deleted from R but it will simply excluded from Tableau visualization.

summary(divvy_2004_2103_clean)

##    ride_id          rideable_type        started_at                    
##  Length:3479196     Length:3479196     Min.   :2020-04-01 00:00:30.00  
##  Class :character   Class :character   1st Qu.:2020-07-14 18:48:36.50  
##  Mode  :character   Mode  :character   Median :2020-08-29 14:36:43.50  
##                                        Mean   :2020-09-10 01:41:08.00  
##                                        3rd Qu.:2020-10-20 20:26:31.75  
##                                        Max.   :2021-03-31 23:59:08.00  
##                                                                        
##     ended_at                      start_station_name start_station_id  
##  Min.   :2020-04-01 00:10:45.00   Length:3479196     Length:3479196    
##  1st Qu.:2020-07-14 19:20:26.00   Class :character   Class :character  
##  Median :2020-08-29 15:09:12.00   Mode  :character   Mode  :character  
##  Mean   :2020-09-10 02:09:05.05                                        
##  3rd Qu.:2020-10-20 20:42:57.50                                        
##  Max.   :2021-04-06 11:00:11.00                                        
##                                                                        
##  end_station_name   end_station_id       start_lat       start_lng     
##  Length:3479196     Length:3479196     Min.   :41.64   Min.   :-87.87  
##  Class :character   Class :character   1st Qu.:41.88   1st Qu.:-87.66  
##  Mode  :character   Mode  :character   Median :41.90   Median :-87.64  
##                                        Mean   :41.90   Mean   :-87.64  
##                                        3rd Qu.:41.93   3rd Qu.:-87.63  
##                                        Max.   :42.08   Max.   :-87.52  
##                                                                        
##     end_lat         end_lng       member_casual           date           
##  Min.   :41.54   Min.   :-88.07   Length:3479196     Min.   :2020-04-01  
##  1st Qu.:41.88   1st Qu.:-87.66   Class :character   1st Qu.:2020-07-14  
##  Median :41.90   Median :-87.64   Mode  :character   Median :2020-08-29  
##  Mean   :41.90   Mean   :-87.64                      Mean   :2020-09-09  
##  3rd Qu.:41.93   3rd Qu.:-87.63                      3rd Qu.:2020-10-20  
##  Max.   :42.16   Max.   :-87.44                      Max.   :2021-03-31  
##  NA's   :4712    NA's   :4712                                            
##      year              month               day            day_of_week       
##  Length:3479196     Length:3479196     Length:3479196     Length:3479196    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   ride_length     
##  Min.   :      0  
##  1st Qu.:    476  
##  Median :    874  
##  Mean   :   1677  
##  3rd Qu.:   1601  
##  Max.   :3523202  
##

Compare members and casual users

mean_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = mean)
colnames(mean_agr) <- c("Customer Type", "Mean Value")
median_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = median)
colnames(median_agr) <- c("Customer Type", "Median Value")
max_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = max)
colnames(max_agr) <- c("Customer Type", "Max Value")
min_agr <- aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual, FUN = min)
colnames(min_agr) <- c("Customer Type", "Min Value")

print(mean_agr)

##   Customer Type Mean Value
## 1        casual  2698.1722
## 2        member   966.9067

print(median_agr)

##   Customer Type Median Value
## 1        casual         1272
## 2        member          689

print(max_agr)

##   Customer Type Max Value
## 1        casual   3341033
## 2        member   3523202

print(min_agr)

##   Customer Type Min Value
## 1        casual         0
## 2        member         0

The days of the week should be ordered correctly

divvy_2004_2103_clean$day_of_week <- ordered(divvy_2004_2103_clean$day_of_week, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

Check the average ride duration by day for members vs casual users

aggregate(divvy_2004_2103_clean$ride_length ~ divvy_2004_2103_clean$member_casual + divvy_2004_2103_clean$day_of_week, FUN = mean)

##    divvy_2004_2103_clean$member_casual divvy_2004_2103_clean$day_of_week
## 1                               casual                            Monday
## 2                               member                            Monday
## 3                               casual                           Tuesday
## 4                               member                           Tuesday
## 5                               casual                         Wednesday
## 6                               member                         Wednesday
## 7                               casual                          Thursday
## 8                               member                          Thursday
## 9                               casual                            Friday
## 10                              member                            Friday
## 11                              casual                          Saturday
## 12                              member                          Saturday
## 13                              casual                            Sunday
## 14                              member                            Sunday
##    divvy_2004_2103_clean$ride_length
## 1                          2698.5630
## 2                           920.2266
## 3                          2429.2405
## 4                           907.8393
## 5                          2418.8408
## 6                           918.8948
## 7                          2578.1268
## 8                           912.5715
## 9                          2566.0303
## 10                          948.1130
## 11                         2817.3583
## 12                         1067.6161
## 13                         3044.5856
## 14                         1093.0548

File extraction.

Finally, the cleaning process done! Lets extract the file, and then analyze and share the data in Tableau!

write.csv(divvy_2004_2103_clean, "divvy_2004_2103.csv")

Google Data Analytics - Capstone: Project case 1

Taha Nawfal

2022-10-28