Title: | Electric Vehicle Charging Sessions Profiling and Modelling |
---|---|
Description: | Tools for modelling electric vehicle charging sessions into generic groups with similar connection patterns called "user profiles", using Gaussian Mixture Models clustering. The clustering and profiling methodology is described in Cañigueral and Meléndez (2021, ISBN:0142-0615) <doi:10.1016/j.ijepes.2021.107195>. |
Authors: | Marc Cañigueral [aut, cre, cph] |
Maintainer: | Marc Cañigueral <[email protected]> |
License: | GPL-3 |
Version: | 1.1.2 |
Built: | 2024-11-23 04:34:36 UTC |
Source: | https://github.com/mcanigueral/evprof |
The Bayesian Information Criterion (BIC) is the value of the maximized log-likelihood with a penalty on the number of parameters in the model, and allows comparison of models with differing parameterizations and/or differing numbers of clusters. In general, the larger the value of the BIC, the stronger the evidence for the model and number of clusters (see, e.g., Fraley and Raftery 2002a).
choose_k_GMM( sessions, k, mclust_tol = 1e-08, mclust_itmax = 10000, log = FALSE, start = getOption("evprof.start.hour") )
choose_k_GMM( sessions, k, mclust_tol = 1e-08, mclust_itmax = 10000, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
k |
sequence with the number of clusters, for example 1:10, for 1 to 10 clusters. |
mclust_tol |
tolerance parameter for clustering |
mclust_itmax |
maximum number of iterations |
log |
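logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |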
start |
integer, start hour in the x axis of the plot. |
BIC plot
choose_k_GMM(california_ev_sessions, k = 1:4, start = 3)
choose_k_GMM(california_ev_sessions, k = 1:4, start = 3)
Cluster sessions with `mclust` package
cluster_sessions( sessions, k, seed, mclust_tol = 1e-08, mclust_itmax = 10000, log = FALSE, start = getOption("evprof.start.hour") )
cluster_sessions( sessions, k, seed, mclust_tol = 1e-08, mclust_itmax = 10000, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
k |
number of clusters |
seed |
random seed |
mclust_tol |
tolerance parameter for clustering |
mclust_itmax |
maximum number of iterations |
log |
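logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |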
start |
integer, start hour in the x axis of the plot. |
list with two attributes: sessions and models
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # The column `Cluster` has been added names(sessions_clusters$sessions) plot_points(sessions_clusters$sessions) + ggplot2::aes(color = Cluster)
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # The column `Cluster` has been added names(sessions_clusters$sessions) plot_points(sessions_clusters$sessions) + ggplot2::aes(color = Cluster)
Cut outliers based on minimum and maximum limits of ConnectionHours and ConnectionStartDateTime variables
cut_sessions( sessions, connection_hours_min = NA, connection_hours_max = NA, connection_start_min = NA, connection_start_max = NA, log = FALSE, start = getOption("evprof.start.hour") )
cut_sessions( sessions, connection_hours_min = NA, connection_hours_max = NA, connection_start_min = NA, connection_start_max = NA, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
connection_hours_min |
numeric, minimum of connection hours (duration). If NA the minimum value is considered. |
connection_hours_max |
numeric, maximum of connection hours (duration). If NA the maximum value is considered. |
connection_start_min |
numeric, minimum hour of connection start (hour as numeric). If NA the minimum value is considered. |
connection_start_max |
numeric, maximum hour of connection start (hour as numeric). If NA the maximum value is considered. |
log |
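logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |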
start |
integer, start hour in the x axis of the plot. |
session dataframe
library(dplyr) # Localize the outlying sessions above a certain threshold california_ev_sessions %>% sample_frac(0.05) %>% plot_points(start = 3) # For example sessions that start before 5 AM or that are # longer than 20 hours are considered outliers sessions_clean <- california_ev_sessions %>% sample_frac(0.05) %>% cut_sessions( start = 3, connection_hours_max = 20, connection_start_min = 5 ) plot_points(sessions_clean, start = 3)
library(dplyr) # Localize the outlying sessions above a certain threshold california_ev_sessions %>% sample_frac(0.05) %>% plot_points(start = 3) # For example sessions that start before 5 AM or that are # longer than 20 hours are considered outliers sessions_clean <- california_ev_sessions %>% sample_frac(0.05) %>% cut_sessions( start = 3, connection_hours_max = 20, connection_start_min = 5 ) plot_points(sessions_clean, start = 3)
Every cluster has a centroid (i.e. average start time and duration) that can be related to a daily human behaviour or connection pattern (e.g. Worktime, Dinner, etc.). In this function, a user profile name is assigned to every cluster.
define_clusters( models, interpretations = NULL, profile_names = NULL, log = FALSE )
define_clusters( models, interpretations = NULL, profile_names = NULL, log = FALSE )
models |
tibble, parameters of the clusters' GMM models obtained with
function `cluster_sessions` (object `models` of the returned list) |
interpretations |
character vector with interpretation sentences of each cluster (arranged by cluster number) |
profile_names |
character vector with user profile assigned to each cluster (arranged by cluster number) |
log |
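logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |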
tibble object
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during working hours", "Connections during all day (high variability)" ), profile_names = c("Workers", "Visitors"), log = TRUE )
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during working hours", "Connections during all day (high variability)" ), profile_names = c("Workers", "Visitors"), log = TRUE )
Detect outliers
detect_outliers( sessions, MinPts = NULL, eps = NULL, noise_th = 2, log = FALSE, start = getOption("evprof.start.hour") )
detect_outliers( sessions, MinPts = NULL, eps = NULL, noise_th = 2, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
MinPts |
MinPts parameter for DBSCAN clustering |
eps |
eps parameter for DBSCAN clustering |
noise_th |
noise threshold |
log |
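logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |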
start |
integer, start hour in the x axis of the plot. |
sessions tibble with extra boolean column Outlier
library(dplyr) sessions_outliers <- california_ev_sessions %>% sample_frac(0.05) %>% detect_outliers(start = 3, noise_th = 5, eps = 2.5)
library(dplyr) sessions_outliers <- california_ev_sessions %>% sample_frac(0.05) %>% detect_outliers(start = 3, noise_th = 5, eps = 2.5)
Divide sessions by disconnection day
divide_by_disconnection( sessions, division_hour, start = getOption("evprof.start.hour") )
divide_by_disconnection( sessions, division_hour, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
division_hour |
Hour to divide the groups according to disconnection time |
start |
integer, start hour in the x axis of the plot. |
same sessions data set with extra column "Disconnection"
library(dplyr) sessions_disconnection <- california_ev_sessions %>% sample_frac(0.05) %>% divide_by_disconnection( start = 2, division_hour = 5 ) # The column `Disconnection` has been added names(sessions_disconnection) library(ggplot2) sessions_disconnection %>% tidyr::drop_na() %>% plot_points() + facet_wrap(vars(Disconnection))
library(dplyr) sessions_disconnection <- california_ev_sessions %>% sample_frac(0.05) %>% divide_by_disconnection( start = 2, division_hour = 5 ) # The column `Disconnection` has been added names(sessions_disconnection) library(ggplot2) sessions_disconnection %>% tidyr::drop_na() %>% plot_points() + facet_wrap(vars(Disconnection))
Divide sessions by time-cycle
divide_by_timecycle( sessions, months_cycles = list(1:12), wdays_cycles = list(1:5, 6:7), start = getOption("evprof.start.hour") )
divide_by_timecycle( sessions, months_cycles = list(1:12), wdays_cycles = list(1:5, 6:7), start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
months_cycles |
list containing Monthly cycles |
wdays_cycles |
list containing Weekdays cycles |
start |
integer, start hour in the x axis of the plot. |
same sessions data set with extra column "Timecycle"
library(dplyr) sessions_timecycles <- california_ev_sessions %>% sample_frac(0.05) %>% divide_by_timecycle( months_cycles = list(1:12), wdays_cycles = list(1:5, 6:7) ) # The column `Timecycle` has been added names(sessions_timecycles) library(ggplot2) plot_points(sessions_timecycles) + facet_wrap(vars(Timecycle))
library(dplyr) sessions_timecycles <- california_ev_sessions %>% sample_frac(0.05) %>% divide_by_timecycle( months_cycles = list(1:12), wdays_cycles = list(1:5, 6:7) ) # The column `Timecycle` has been added names(sessions_timecycles) library(ggplot2) plot_points(sessions_timecycles) + facet_wrap(vars(Timecycle))
Drop outliers
drop_outliers(sessions)
drop_outliers(sessions)
sessions |
tibble, sessions data set in evprof standard format. |
sessions tibble without the outlying sessions and without the `Outlier` column
library(dplyr) sessions_outliers <- california_ev_sessions %>% sample_frac(0.05) %>% detect_outliers(start = 3, noise_th = 5, eps = 2.5) plot_outliers(sessions_outliers, start = 3) sessions_clean <- drop_outliers(sessions_outliers) plot_points(sessions_clean, start = 3)
library(dplyr) sessions_outliers <- california_ev_sessions %>% sample_frac(0.05) %>% detect_outliers(start = 3, noise_th = 5, eps = 2.5) plot_outliers(sessions_outliers, start = 3) sessions_clean <- drop_outliers(sessions_outliers) plot_points(sessions_clean, start = 3)
Get charging rates distribution in percentages
get_charging_rates_distribution(sessions, unit = "year")
get_charging_rates_distribution(sessions, unit = "year")
sessions |
tibble, sessions data set in evprof standard format. |
unit |
character, lubridate `floor_date` unit parameter |
tibble
get_charging_rates_distribution(california_ev_sessions, unit="month") get_charging_rates_distribution(california_ev_sessions, unit="month")
get_charging_rates_distribution(california_ev_sessions, unit="month") get_charging_rates_distribution(california_ev_sessions, unit="month")
Get a tibble of connection GMM for every user profile
get_connection_models( subsets_clustering = list(), clusters_definition = list() )
get_connection_models( subsets_clustering = list(), clusters_definition = list() )
subsets_clustering |
list with clustering results of each subset
(direct output from function `cluster_sessions`) |
clusters_definition |
list of tibbles with clusters definitions
(direct output from function `define_clusters`) |
tibble
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations clusters_definitions <- define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during working hours", "Connections during all day (high variability)" ), profile_names = c("Workers", "Visitors"), log = TRUE ) # Create a table with the connection GMM parameters get_connection_models( subsets_clustering = list(sessions_clusters), clusters_definition = list(clusters_definitions) )
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations clusters_definitions <- define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during working hours", "Connections during all day (high variability)" ), profile_names = c("Workers", "Visitors"), log = TRUE ) # Create a table with the connection GMM parameters get_connection_models( subsets_clustering = list(sessions_clusters), clusters_definition = list(clusters_definitions) )
Get the daily average number of sessions given a range of years, months and weekdays
get_daily_avg_n_sessions(sessions, years, months, wdays)
get_daily_avg_n_sessions(sessions, years, months, wdays)
sessions |
tibble, sessions data set in evprof standard format. |
years |
vector of integers, range of years to consider |
months |
vector of integers, range of months to consider |
wdays |
vector of integers, range of weekdays to consider |
tibble with the number of sessions of each date in the given time period
get_daily_avg_n_sessions( california_ev_sessions, year = 2018, months = c(5, 6), wdays = 1 )
get_daily_avg_n_sessions( california_ev_sessions, year = 2018, months = c(5, 6), wdays = 1 )
Get daily number of sessions given a range of years, months and weekdays
get_daily_n_sessions(sessions, years, months, wdays)
get_daily_n_sessions(sessions, years, months, wdays)
sessions |
tibble, sessions data set in evprof standard format. |
years |
vector of integers, range of years to consider |
months |
vector of integers, range of months to consider |
wdays |
vector of integers, range of weekdays to consider |
tibble with the number of sessions of each date in the given time period
get_daily_n_sessions( california_ev_sessions, year = 2018, months = c(5, 6), wdays = 1 )
get_daily_n_sessions( california_ev_sessions, year = 2018, months = c(5, 6), wdays = 1 )
Get the minPts and eps values for DBSCAN to label only a specific percentage as noise
get_dbscan_params( sessions, MinPts, eps0, noise_th = 2, eps_offset_pct = 0.9, eps_inc_pct = 0.02, log = FALSE, start = getOption("evprof.start.hour") )
get_dbscan_params( sessions, MinPts, eps0, noise_th = 2, eps_offset_pct = 0.9, eps_inc_pct = 0.02, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
MinPts |
DBSCAN MinPts parameter |
eps0 |
DBSCAN eps parameter corresponding to the elbow of kNN dist plot |
noise_th |
noise threshold |
eps_offset_pct |
eps_offset_pct |
eps_inc_pct |
eps_inc_pct |
log |
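logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |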
start |
integer, start hour in the x axis of the plot. |
tibble with minPts and eps parameters, and the corresponding noise
This function simulates random energy values, makes the density curve and overlaps the simulated density curve with the real density curve of the user profile's energy values. This is useful to appreciate how the modeled values fit the real ones and increase or decrease the number of Gaussian components.
get_energy_models(sessions_profiles, log = TRUE, by_power = FALSE)
get_energy_models(sessions_profiles, log = TRUE, by_power = FALSE)
sessions_profiles |
tibble, sessions data set in evprof
standard format
with user profile attribute |
log |
logical, whether to transform |
by_power |
Logical, true to fit the energy models for every charging rate separately |
tibble
library(dplyr) # Classify each session to the corresponding user profile sessions_profiles <- california_ev_sessions_profiles %>% dplyr::sample_frac(0.05) # Get a table with the energy GMM parameters get_energy_models(sessions_profiles, log = TRUE) # If there is a `Power` variable in the data set # you can create an energy model per power rate and user profile # First it is convenient to round the `Power` values for more generic models sessions_profiles <- sessions_profiles %>% mutate(Power = round_to_interval(Power, 3.7)) %>% filter(Power < 11) sessions_profiles$Power[sessions_profiles$Power == 0] <- 3.7 get_energy_models(sessions_profiles, log = TRUE, by_power = TRUE)
library(dplyr) # Classify each session to the corresponding user profile sessions_profiles <- california_ev_sessions_profiles %>% dplyr::sample_frac(0.05) # Get a table with the energy GMM parameters get_energy_models(sessions_profiles, log = TRUE) # If there is a `Power` variable in the data set # you can create an energy model per power rate and user profile # First it is convenient to round the `Power` values for more generic models sessions_profiles <- sessions_profiles %>% mutate(Power = round_to_interval(Power, 3.7)) %>% filter(Power < 11) sessions_profiles$Power[sessions_profiles$Power == 0] <- 3.7 get_energy_models(sessions_profiles, log = TRUE, by_power = TRUE)
Get the EV model object of class `evmodel`
get_ev_model( names, months_lst = list(1:12, 1:12), wdays_lst = list(1:5, 6:7), connection_GMM, energy_GMM, connection_log, energy_log, data_tz )
get_ev_model( names, months_lst = list(1:12, 1:12), wdays_lst = list(1:5, 6:7), connection_GMM, energy_GMM, connection_log, energy_log, data_tz )
names |
character vector with the given names of each time-cycle model |
months_lst |
list of integer vectors with the corresponding months of the year for each time-cycle model |
wdays_lst |
list of integer vectors with the corresponding days of the week for each model (week start = 1) |
connection_GMM |
list of different connection bivariate GMM obtained from `get_connection_models` |
energy_GMM |
list of different energy univariate GMM obtained from `get_energy_models` |
connection_log |
logical, true if connection models have logarithmic transformations |
energy_log |
logical, true if energy models have logarithmic transformations |
data_tz |
character, time zone of the original data (necessary to properly simulate new sessions) |
object of class evmodel
# The package evprof provides example objects of connection and energy # Gaussian Mixture Models obtained from California's open data set # (see California article in package website) created with functions # `get_connection models` and `get_energy models`. # For workdays sessions workdays_connection_models <- evprof::california_GMM$workdays$connection_models workdays_energy_models <- evprof::california_GMM$workdays$energy_models # For weekends sessions weekends_connection_models <- evprof::california_GMM$weekends$connection_models weekends_energy_models <- evprof::california_GMM$weekends$energy_models # Get the whole model ev_model <- get_ev_model( names = c("Workdays", "Weekends"), months_lst = list(1:12, 1:12), wdays_lst = list(1:5, 6:7), connection_GMM = list(workdays_connection_models, weekends_connection_models), energy_GMM = list(workdays_energy_models, weekends_energy_models), connection_log = TRUE, energy_log = TRUE, data_tz = "America/Los_Angeles" )
# The package evprof provides example objects of connection and energy # Gaussian Mixture Models obtained from California's open data set # (see California article in package website) created with functions # `get_connection models` and `get_energy models`. # For workdays sessions workdays_connection_models <- evprof::california_GMM$workdays$connection_models workdays_energy_models <- evprof::california_GMM$workdays$energy_models # For weekends sessions weekends_connection_models <- evprof::california_GMM$weekends$connection_models weekends_energy_models <- evprof::california_GMM$weekends$energy_models # Get the whole model ev_model <- get_ev_model( names = c("Workdays", "Weekends"), months_lst = list(1:12, 1:12), wdays_lst = list(1:5, 6:7), connection_GMM = list(workdays_connection_models, weekends_connection_models), energy_GMM = list(workdays_energy_models, weekends_energy_models), connection_log = TRUE, energy_log = TRUE, data_tz = "America/Los_Angeles" )
Plot Bivariate Gaussian Mixture Models
plot_bivarGMM( sessions, models, profiles_names = seq(1, nrow(models)), points_size = 0.25, lines_size = 1, legend_nrow = 2, log = FALSE, start = getOption("evprof.start.hour") )
plot_bivarGMM( sessions, models, profiles_names = seq(1, nrow(models)), points_size = 0.25, lines_size = 1, legend_nrow = 2, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
models |
tibble, parameters of the clusters' GMM models obtained with
function `cluster_sessions` (object `models` of the returned list) |
profiles_names |
names of profiles |
points_size |
size of scatter points in the plot |
lines_size |
size of lines in the plot |
legend_nrow |
number of rows in legend |
log |
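logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |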
start |
integer, start hour in the x axis of the plot. |
ggplot2 plot
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 )
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 )
Density plot in 2D, considering Start time and Connection duration as variables
plot_density_2D( sessions, bins = 15, by = c("wday", "month", "year"), start = getOption("evprof.start.hour"), log = FALSE )
plot_density_2D( sessions, bins = 15, by = c("wday", "month", "year"), start = getOption("evprof.start.hour"), log = FALSE )
sessions |
tibble, sessions data set in evprof standard format. |
bins |
integer, parameter to pass to `ggplot2::stat_density_2d` |
by |
variable to facet the plot. Character being "wday", "month" or "year", considering the week to start at wday=1. |
start |
integer, start hour in the x axis of the plot. |
log |
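logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |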
ggplot2 plot
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_density_2D(by = "wday", start = 3, bins = 15, log = FALSE)
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_density_2D(by = "wday", start = 3, bins = 15, log = FALSE)
Density plot in 3D, considering Start time and Connection duration as variables
plot_density_3D( sessions, start = getOption("evprof.start.hour"), eye = list(x = -1.5, y = -1.5, z = 1.5), log = FALSE )
plot_density_3D( sessions, start = getOption("evprof.start.hour"), eye = list(x = -1.5, y = -1.5, z = 1.5), log = FALSE )
sessions |
tibble, sessions data set in evprof standard format. |
start |
integer, start hour in the x axis of the plot. |
eye |
list containing x, y and z points of view. Example: `list(x = -1.5, y = -1.5, z = 1.5)` |
log |
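logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |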
plotly plot (html)
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_density_3D(start = 3)
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_density_3D(start = 3)
Iteration over evprof::plot_division_line function to plot multiple lines
plot_division_lines(ggplot_points, n_lines, division_hour)
plot_division_lines(ggplot_points, n_lines, division_hour)
ggplot_points |
ggplot2 returned by evprof::plot_points function |
n_lines |
number of lines to plot |
division_hour |
Hour to divide the groups according to disconnection time |
ggplot2 function
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_points(start = 3) %>% plot_division_lines(n_lines = 1, division_hour = 5)
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_points(start = 3) %>% plot_division_lines(n_lines = 1, division_hour = 5)
Compare density of estimated energy with density of real energy vector
plot_energy_models(energy_models, nrow = 2)
plot_energy_models(energy_models, nrow = 2)
energy_models |
energy models returned by function `get_energy_models` |
nrow |
integer, number of rows in the plot grid (passed to `cowplot::plot_grid`) |
ggplot
# The package evprof provides example objects of connection and energy # Gaussian Mixture Models obtained from California's open data set # (see California article in package website) created with functions # `get_connection models` and `get_energy models`. # Get the working days energy models energy_models <- evprof::california_GMM$workdays$energy_models # Plot energy models plot_energy_models(energy_models)
# The package evprof provides example objects of connection and energy # Gaussian Mixture Models obtained from California's open data set # (see California article in package website) created with functions # `get_connection models` and `get_energy models`. # Get the working days energy models energy_models <- evprof::california_GMM$workdays$energy_models # Plot energy models plot_energy_models(energy_models)
Histogram of a variable from sessions data set
plot_histogram(sessions, var, binwidth = 1)
plot_histogram(sessions, var, binwidth = 1)
sessions |
tibble, sessions data set in evprof standard format. |
var |
character, column name to compute the histogram for |
binwidth |
integer, width of histogram bins |
ggplot plot
plot_histogram(california_ev_sessions, "Power", binwidth = 2) plot_histogram(california_ev_sessions, "Power", binwidth = 0.1)
plot_histogram(california_ev_sessions, "Power", binwidth = 2) plot_histogram(california_ev_sessions, "Power", binwidth = 0.1)
Grid of multiple variable histograms
plot_histogram_grid( sessions, vars = evprof::sessions_summary_feature_names, binwidths = rep(1, length(vars)), nrow = NULL, ncol = NULL )
plot_histogram_grid( sessions, vars = evprof::sessions_summary_feature_names, binwidths = rep(1, length(vars)), nrow = NULL, ncol = NULL )
sessions |
tibble, sessions data set in evprof standard format. |
vars |
vector of characters, variables to plot |
binwidths |
vector of integers, binwidths of each variable histogram.
The length of the vector must correspond to the length of the `vars` parameter. |
nrow |
integer, number of rows of the plot grid |
ncol |
integer, number of columns of the plot grid |
grid plot
plot_histogram_grid(california_ev_sessions) plot_histogram_grid(california_ev_sessions, vars = c("Energy", "Power"))
plot_histogram_grid(california_ev_sessions) plot_histogram_grid(california_ev_sessions, vars = c("Energy", "Power"))
Plot the kNN (k-nearest neighbors) distance plot to visually detect the "elbow" and define an appropriate value for the `eps` DBSCAN parameter.
plot_kNNdist( sessions, MinPts = NULL, log = FALSE, start = getOption("evprof.start.hour") )
plot_kNNdist( sessions, MinPts = NULL, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
MinPts |
integer, DBSCAN MinPts parameter. If null, a value of 200 will be considered. |
log |
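logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |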
start |
integer, start hour in the x axis of the plot. |
The kNN (k-nearest neighbors) distance plot can provide insights into
setting the eps
parameter in DBSCAN. The "elbow" in the kNN distance plot
is the point where the distances start to increase significantly. At the
same time, for DBSCAN, the eps parameter defines the radius within which a
specified number of points must exist for a data point to be considered a
core point. Therefore, the "elbow" of the kNN distance plot can provide a
sense of the scale of the data and help you choose a reasonable range for
the eps
parameter in DBSCAN.
plot
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_kNNdist(start = 3, log = TRUE)
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_kNNdist(start = 3, log = TRUE)
Plot all bi-variable GMM (clusters) with the colors corresponding to the assigned user profile. This shows which clusters correspond to which user profile, and the proportion of every user profile.
plot_model_clusters( subsets_clustering = list(), clusters_definition = list(), profiles_ratios, log = TRUE )
plot_model_clusters( subsets_clustering = list(), clusters_definition = list(), profiles_ratios, log = TRUE )
subsets_clustering |
list with clustering results of each subset
(direct output from function `cluster_sessions`) |
clusters_definition |
list of tibbles with clusters definitions
(direct output from function `define_clusters`) |
profiles_ratios |
tibble with columns `profile` and `ratio` |
log |
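logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |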
ggplot2
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- evprof::california_ev_sessions_profiles %>% filter(Timecycle == "Workday") %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations clusters_definitions <- define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during all day (high variability)", "Connections during working hours" ), profile_names = c("Visitors", "Workers"), log = TRUE ) # Create a table with the connection GMM parameters connection_models <- get_connection_models( subsets_clustering = list(sessions_clusters), clusters_definition = list(clusters_definitions) ) # Plot all bi-variable GMM (clusters) with the colors corresponding # to their assigned user profile plot_model_clusters( subsets_clustering = list(sessions_clusters), clusters_definition = list(clusters_definitions), profiles_ratios = connection_models[c("profile", "ratio")] )
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- evprof::california_ev_sessions_profiles %>% filter(Timecycle == "Workday") %>% sample_frac(0.05) plot_points(sessions_day, start = 3) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations clusters_definitions <- define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during all day (high variability)", "Connections during working hours" ), profile_names = c("Visitors", "Workers"), log = TRUE ) # Create a table with the connection GMM parameters connection_models <- get_connection_models( subsets_clustering = list(sessions_clusters), clusters_definition = list(clusters_definitions) ) # Plot all bi-variable GMM (clusters) with the colors corresponding # to their assigned user profile plot_model_clusters( subsets_clustering = list(sessions_clusters), clusters_definition = list(clusters_definitions), profiles_ratios = connection_models[c("profile", "ratio")] )
Plot outlying sessions
plot_outliers( sessions, start = getOption("evprof.start.hour"), log = FALSE, ... )
plot_outliers( sessions, start = getOption("evprof.start.hour"), log = FALSE, ... )
sessions |
tibble, sessions data set in evprof standard format. |
start |
integer, start hour in the x axis of the plot. |
log |
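logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |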
... |
arguments to pass to function `ggplot2::geom_point` |
ggplot2 plot
library(dplyr) sessions_outliers <- california_ev_sessions %>% sample_frac(0.05) %>% detect_outliers(start = 3, noise_th = 5, eps = 2.5) plot_outliers(sessions_outliers, start = 3) plot_outliers(sessions_outliers, start = 3, log = TRUE)
library(dplyr) sessions_outliers <- california_ev_sessions %>% sample_frac(0.05) %>% detect_outliers(start = 3, noise_th = 5, eps = 2.5) plot_outliers(sessions_outliers, start = 3) plot_outliers(sessions_outliers, start = 3, log = TRUE)
Scatter plot of sessions
plot_points(sessions, start = getOption("evprof.start.hour"), log = FALSE, ...)
plot_points(sessions, start = getOption("evprof.start.hour"), log = FALSE, ...)
sessions |
tibble, sessions data set in evprof standard format. |
start |
integer, start hour in the x axis of the plot. |
log |
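logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |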
... |
arguments to `ggplot2::geom_point` function |
ggplot scatter plot
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_points() california_ev_sessions %>% sample_frac(0.05) %>% plot_points(start = 3) california_ev_sessions %>% sample_frac(0.05) %>% plot_points(log = TRUE)
library(dplyr) california_ev_sessions %>% sample_frac(0.05) %>% plot_points() california_ev_sessions %>% sample_frac(0.05) %>% plot_points(start = 3) california_ev_sessions %>% sample_frac(0.05) %>% plot_points(log = TRUE)
Read an EV model JSON file and convert it to object of class `evmodel`
read_ev_model(file)
read_ev_model(file)
file |
path to the JSON file |
object of class evmodel
ev_model <- california_ev_model # Model of example save_ev_model(ev_model, file = file.path(tempdir(), "evmodel.json")) read_ev_model(file = file.path(tempdir(), "evmodel.json"))
ev_model <- california_ev_model # Model of example save_ev_model(ev_model, file = file.path(tempdir(), "evmodel.json")) read_ev_model(file = file.path(tempdir(), "evmodel.json"))
Round to nearest interval
round_to_interval(dbl, interval)
round_to_interval(dbl, interval)
dbl |
number to round |
interval |
rounding interval |
numeric value
set.seed(1) random_vct <- rnorm(10, 5, 5) round_to_interval(random_vct, 2.5)
set.seed(1) random_vct <- rnorm(10, 5, 5) round_to_interval(random_vct, 2.5)
Save iteration plots in PDF file
save_clustering_iterations( sessions, k, filename, it = 12, seeds = round(runif(it, min = 1, max = 1000)), plot_scale = 2, points_size = 0.25, mclust_tol = 1e-08, mclust_itmax = 10000, log = FALSE, start = getOption("evprof.start.hour") )
save_clustering_iterations( sessions, k, filename, it = 12, seeds = round(runif(it, min = 1, max = 1000)), plot_scale = 2, points_size = 0.25, mclust_tol = 1e-08, mclust_itmax = 10000, log = FALSE, start = getOption("evprof.start.hour") )
sessions |
tibble, sessions data set in evprof standard format. |
k |
number of clusters |
filename |
string defining the PDF output file path (with extension .pdf) |
it |
number of iterations |
seeds |
seed for each iteration |
plot_scale |
scale of each iteration plot for a good visualization in pdf file |
points_size |
numeric, size of points in the scatter plot |
mclust_tol |
tolerance parameter for clustering |
mclust_itmax |
maximum number of iterations |
log |
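logical, whether to transform `ConnectionStartDateTime` and `ConnectionHours` variables to natural logarithmic scale (base = `exp(1)`). |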
start |
integer, start hour in the x axis of the plot. |
nothing, but a PDF file is saved in the path specified by parameter filename
temp_file <- file.path(tempdir(), "iteration.pdf") save_clustering_iterations(california_ev_sessions, k = 2, it = 4, filename = temp_file)
temp_file <- file.path(tempdir(), "iteration.pdf") save_clustering_iterations(california_ev_sessions, k = 2, it = 4, filename = temp_file)
Save the EV model object of class `evmodel` to a JSON file
save_ev_model(evmodel, file)
save_ev_model(evmodel, file)
evmodel |
object of class `evmodel` |
file |
character string with the path or name of the file |
nothing, but saves the `evmodel` object in a JSON file
ev_model <- california_ev_model # Model of example save_ev_model(ev_model, file = file.path(tempdir(), "evmodel.json"))
ev_model <- california_ev_model # Model of example save_ev_model(ev_model, file = file.path(tempdir(), "evmodel.json"))
Joins all sub-sets from the list, adding a new column Profile
set_profiles(sessions_clustered = list(), clusters_definition = list())
set_profiles(sessions_clustered = list(), clusters_definition = list())
sessions_clustered |
list of tibbles with sessions clustered
(the `sessions` element of the output from function `cluster_sessions`), one per subset |
clusters_definition |
list of tibbles with clusters definitions
(direct output from function `define_clusters`) |
tibble
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations clusters_definitions <- define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during working hours", "Connections during all day (high variability)" ), profile_names = c("Workers", "Visitors"), log = TRUE ) # Classify each session to the corresponding user profile sessions_profiles <- set_profiles( sessions_clustered = list(sessions_clusters$sessions), clusters_definition = list(clusters_definitions) )
library(dplyr) # Select working day sessions (`Timecycle == 1`) that # disconnect the same day (`Disconnection == 1`) sessions_day <- california_ev_sessions %>% divide_by_timecycle( months_cycles = list(1:12), # Not differentiation between months wdays_cycles = list(1:5, 6:7) # Differentiation between workdays/weekends ) %>% divide_by_disconnection( division_hour = 10, start = 3 ) %>% filter( Disconnection == 1, Timecycle == 1 ) %>% sample_frac(0.05) # Identify two clusters sessions_clusters <- cluster_sessions( sessions_day, k=2, seed = 1234, log = TRUE ) # Plot the clusters found plot_bivarGMM( sessions = sessions_clusters$sessions, models = sessions_clusters$models, log = TRUE, start = 3 ) # Define the clusters with user profile interpretations clusters_definitions <- define_clusters( models = sessions_clusters$models, interpretations = c( "Connections during working hours", "Connections during all day (high variability)" ), profile_names = c("Workers", "Visitors"), log = TRUE ) # Classify each session to the corresponding user profile sessions_profiles <- set_profiles( sessions_clustered = list(sessions_clusters$sessions), clusters_definition = list(clusters_definitions) )
Statistic summary of sessions features
summarise_sessions( sessions, .funs, vars = evprof::sessions_summary_feature_names )
summarise_sessions( sessions, .funs, vars = evprof::sessions_summary_feature_names )
sessions |
tibble, sessions data set in evprof standard format. |
.funs |
A function to compute, e.g. `mean`, `max`, etc. |
vars |
character vector, variables to compute the summary statistic for |
Summary table
summarise_sessions(california_ev_sessions, mean)
summarise_sessions(california_ev_sessions, mean)