Title: | OPTICS K-Xi Density-Based Clustering |
---|---|
Description: | Provides a novel density-based cluster extraction method, OPTICS k-Xi, and a framework to compare k-Xi models using distance-based metrics to investigate datasets with an unknown number of clusters. |
Authors: | Thomas Charlon [aut, cre] |
Maintainer: | Thomas Charlon <[email protected]> |
License: | GPL-3 |
Version: | 1.1.0 |
Built: | 2024-11-23 05:05:14 UTC |
Source: | https://github.com/thomaschln/opticskxi |
Include NAs and add totals to table.
contingency_table(...)
contingency_table(...)
... |
Passed to table |
Table object
The data set consists of 103 common (>5% minor allele frequency) SNPs genotyped in 129 trios from a European-derived population. These SNPs are in a 500-kb region on human chromosome 5q31 implicated as containing a genetic risk factor for Crohn's disease.
Imported from the gap R package.
An example use of the data is in the following paper: Kelly M. Burkett, Celia M. T. Greenwood, Brad McNeney, Jinko Graham. Gene genealogies for genetic association mapping, with application to Crohn's disease. Front Genet 2013, 4(260). doi: 10.3389/fgene.2013.00260
data(crohn)
data(crohn)
A data frame containing 387 rows and 212 columns
MJ Daly, JD Rioux, SF Schaffner, TJ Hudson, ES Lander (2001). High-resolution haplotype structure in the human genome. Nature Genetics 29:229-232
Fortify a dimension reduction object
fortify_dimred( m_dimred, m_vars = NULL, v_variance = NULL, sup_vars = NULL, var_digits = 1 )
fortify_dimred( m_dimred, m_vars = NULL, v_variance = NULL, sup_vars = NULL, var_digits = 1 )
m_dimred |
Projection matrix |
m_vars |
Rotation matrix (optional) |
v_variance |
Explained variance (optional) |
sup_vars |
Optional supplementary variables |
var_digits |
Explained variance percent digits |
Data frame
pca <- prcomp(iris[-5]) df_pca <- fortify_dimred(pca$x)
pca <- prcomp(iris[-5]) df_pca <- fortify_dimred(pca$x)
Get and fortify ICA
fortify_ica(m_data, ..., sup_vars = NULL)
fortify_ica(m_data, ..., sup_vars = NULL)
m_data |
Input matrix |
... |
Passed to fastICA::fastICA |
sup_vars |
Optional supplementary variables |
Fortified dimension reduction
df_ica <- fortify_ica(iris[-5], n.comp = 2)
df_ica <- fortify_ica(iris[-5], n.comp = 2)
Get and fortify PCA
fortify_pca(m_data, ..., sup_vars = NULL)
fortify_pca(m_data, ..., sup_vars = NULL)
m_data |
Input matrix |
... |
Passed to stats::prcomp |
sup_vars |
Optional supplementary variables |
Fortified dimension reduction
df_pca <- fortify_pca(iris[-5]) df_pca <- fortify_pca(iris[-5], sup_vars = iris[5])
df_pca <- fortify_pca(iris[-5]) df_pca <- fortify_pca(iris[-5], sup_vars = iris[5])
Select k-Xi clustering model based on a metric and a rank
get_best_kxi(df_kxi, metric = "avg.silwidth", rank = 1)
get_best_kxi(df_kxi, metric = "avg.silwidth", rank = 1)
df_kxi |
Data frame returned by opticskxi_pipeline |
metric |
Metric to choose best model |
rank |
Rank(s) of model to choose, ordered by decreasing metric |
df_kxi row with specified metric and rank, simplified to a list if only one rank selected
Plot multiple axes of a data frame or a fortified dimension reduction.
ggpairs( df_data, group = NULL, axes = 1:2, variables = FALSE, n_vars = 0, ellipses = FALSE, ..., title = NULL, colors = if (!is.null(group)) nice_palette(df_data[[group]]) )
ggpairs( df_data, group = NULL, axes = 1:2, variables = FALSE, n_vars = 0, ellipses = FALSE, ..., title = NULL, colors = if (!is.null(group)) nice_palette(df_data[[group]]) )
df_data |
Data frame |
group |
Column name of the grouping of observations |
axes |
Axes to plot. If more than 2, plots all pair combinations |
variables |
Logical, plot variable contributions of the dimension reduction to the selected axes, only for 2 axes |
n_vars |
Maximum number of variable contributions to plot. By default 0, for all variables. |
ellipses |
Logical, plot ellipses of groups |
... |
Passed to ggplot2 stat_ellipse if ellipses are requested |
title |
String to add as title, default NULL |
colors |
Vector of colors for each group |
ggmatrix
df_pca <- fortify_pca(iris[-5]) ggpairs(df_pca) df_pca <- fortify_pca(iris[-5], sup_vars = iris[5]) ggpairs(df_pca, group = 'Species', ellipses = TRUE, variables = TRUE)
df_pca <- fortify_pca(iris[-5]) ggpairs(df_pca) df_pca <- fortify_pca(iris[-5], sup_vars = iris[5]) ggpairs(df_pca, group = 'Species', ellipses = TRUE, variables = TRUE)
Plot metrics of an opticskxi_pipeline output
ggplot_kxi_metrics(df_kxi, metric = c("avg.silwidth", "bw.ratio"), n = 8)
ggplot_kxi_metrics(df_kxi, metric = c("avg.silwidth", "bw.ratio"), n = 8)
df_kxi |
Data frame returned by opticskxi_pipeline |
metric |
Vector of metrics to display from the df_kxi object |
n |
Number of best models for the first metric to display |
ggplot
Plot OPTICS reachability plot.
ggplot_optics( optics_obj, groups = NULL, colors = if (!is.null(groups)) nice_palette(groups), segment_size = 300/nrow(df_optics) )
ggplot_optics( optics_obj, groups = NULL, colors = if (!is.null(groups)) nice_palette(groups), segment_size = 300/nrow(df_optics) )
optics_obj |
dbscan::optics object |
groups |
Optional vector defining groups of OPTICS observations |
colors |
If groups specified, vector of colors for each group |
segment_size |
Size for geom_segment |
ggplot
data('multishapes') optics_obj <- dbscan::optics(multishapes[1:2]) ggplot_optics(optics_obj) ggplot_optics(optics_obj, groups = opticskxi(optics_obj, n_xi = 5, pts = 30))
data('multishapes') optics_obj <- dbscan::optics(multishapes[1:2]) ggplot_optics(optics_obj) ggplot_optics(optics_obj, groups = opticskxi(optics_obj, n_xi = 5, pts = 30))
Plot OPTICS distance profiles of k-Xi clustering models
gtable_kxi_profiles(df_kxi, metric = "avg.silwidth", rank = 1:4, ...)
gtable_kxi_profiles(df_kxi, metric = "avg.silwidth", rank = 1:4, ...)
df_kxi |
Data frame returned by opticskxi_pipeline |
metric |
Metric to choose best clustering model |
rank |
Ranks of models to plot, ordered by decreasing model metric |
... |
Passed to ggplot_kxi_profile |
This data set contains HLA markers DRB, DQA, DQB and phenotypes of 271 Schizophrenia patients (y=1) and controls (y=0). Genotypes for the 3 HLA loci have a prefix name (e.g., "DQB") and a suffix for each of the two alleles (".a1" and ".a2").
Imported from the gap package.
data(hla)
data(hla)
A data frame containing 271 rows and 8 columns
Dr Padraig Wright of Pfizer
Data containing GloVe embeddings of psychology-related words, useful for demonstrating the use of the modified opticskxi pipeline psychkxi.
data("m_psychwords")
data("m_psychwords")
A matrix with 800 words in rows and 100 embedding dimensions in columns.
The dataset contains 2 main hierarchical clusters (each has subclusters).
data('m_psychwords') df_params = expand.grid(n_xi = 9:10, pts = c(15, 20), dist = 'cosine', dim_red = 'ICA', n_dimred_comp = c(10, 15)) df_kxi = opticskxi:::psych_kxi_ensemble_models(m_psychwords, df_params)
data('m_psychwords') df_params = expand.grid(n_xi = 9:10, pts = c(15, 20), dist = 'cosine', dim_red = 'ICA', n_dimred_comp = c(10, 15)) df_kxi = opticskxi:::psych_kxi_ensemble_models(m_psychwords, df_params)
Data containing clusters of any shape. Useful for comparing density-based clustering (DBSCAN) and standard partitioning methods such as k-means clustering. Imported from the factoextra package.
data("multishapes")
data("multishapes")
A data frame with 1100 observations on the following 3 variables.
x
a numeric vector containing the x coordinates of observations
y
a numeric vector containing the y coordinates of observations
shape
a numeric vector corresponding to the cluster number of each observation.
The dataset contains 5 clusters and some outliers/noise.
data('multishapes') plot(multishapes[, 1], multishapes[, 2], col = multishapes[, 3], pch = 19, cex = 0.8)
data('multishapes') plot(multishapes[, 1], multishapes[, 2], col = multishapes[, 3], pch = 19, cex = 0.8)
Color palette
nice_palette(groups, rainbow = FALSE)
nice_palette(groups, rainbow = FALSE)
groups |
Vector, each unique value will get a color |
rainbow |
If TRUE, rainbow-like colors, else differentiate successive values |
Vector of colors
For each of the largest distance differences on the OPTICS profile, consecutive observations left and right on the OPTICS profile (i.e. lower and higher OPTICS id) will be assigned to 2 different clusters if their distance is below the distance of the edge point. If above, observations are set to NA. The pts parameter defines a minimum number of observations to form a valley (i.e. cluster). If the number of observations in one valley is smaller than pts, observations are set to NA.
opticskxi( optics_obj, n_xi, pts = optics_obj$minPts, max_loop = 50, verbose = FALSE )
opticskxi( optics_obj, n_xi, pts = optics_obj$minPts, max_loop = 50, verbose = FALSE )
optics_obj |
Object returned by dbscan::optics |
n_xi |
Number of clusters to define |
pts |
Minimum number of points per cluster |
max_loop |
Maximum iterations to find n_xi clusters |
verbose |
Print the ids of the largest difference considered and cluster information if they define one |
Vector of clusters
opticskxi_pipeline, ggplot_optics
data('multishapes') optics_shapes <- dbscan::optics(multishapes[1:2]) kxi_shapes <- opticskxi(optics_shapes, n_xi = 5, pts = 30) ggplot_optics(optics_shapes, groups = kxi_shapes) ggpairs(cbind(multishapes[1:2], kXi = kxi_shapes), group = 'kXi')
data('multishapes') optics_shapes <- dbscan::optics(multishapes[1:2]) kxi_shapes <- opticskxi(optics_shapes, n_xi = 5, pts = 30) ggplot_optics(optics_shapes, groups = kxi_shapes) ggpairs(cbind(multishapes[1:2], kXi = kxi_shapes), group = 'kXi')
Computes OPTICS k-Xi models based on a parameter grid, binds results in a data frame, and computes distance based metrics for each model.
opticskxi_pipeline( m_data, df_params = expand.grid(n_xi = 1:10, pts = c(20, 30, 40), dist = c("euclidean", "abscorrelation"), dim_red = c("identity", "PCA", "ICA"), n_dimred_comp = c(5, 10, 20)), n_cores = 1 )
opticskxi_pipeline( m_data, df_params = expand.grid(n_xi = 1:10, pts = c(20, 30, 40), dist = c("euclidean", "abscorrelation"), dim_red = c("identity", "PCA", "ICA"), n_dimred_comp = c(5, 10, 20)), n_cores = 1 )
m_data |
Data matrix |
df_params |
Parameter grid for the OPTICS k-Xi function call and optional dimension reduction. Required columns: n_xi, pts, dist. Optional columns: dim_red, n_dimred_comp. |
n_cores |
Number of cores |
Input parameter data frame with results bound in columns optics, clusters and metrics.
get_best_kxi, ggplot_kxi_metrics, gtable_kxi_profiles
data('hla') m_hla <- hla[-c(1:2)] %>% scale df_params_hla <- expand.grid(n_xi = 3:5, pts = c(20, 30), dist = c('manhattan', 'euclidean')) df_kxi_hla <- opticskxi_pipeline(m_hla, df_params_hla) ggplot_kxi_metrics(df_kxi_hla, n = 8) gtable_kxi_profiles(df_kxi_hla) %>% plot best_kxi_hla <- get_best_kxi(df_kxi_hla, rank = 2) clusters_hla <- best_kxi_hla$clusters fortify_pca(m_hla, sup_vars = data.frame(Clusters = clusters_hla)) %>% ggpairs('Clusters', ellipses = TRUE, variables = TRUE)
data('hla') m_hla <- hla[-c(1:2)] %>% scale df_params_hla <- expand.grid(n_xi = 3:5, pts = c(20, 30), dist = c('manhattan', 'euclidean')) df_kxi_hla <- opticskxi_pipeline(m_hla, df_params_hla) ggplot_kxi_metrics(df_kxi_hla, n = 8) gtable_kxi_profiles(df_kxi_hla) %>% plot best_kxi_hla <- get_best_kxi(df_kxi_hla, rank = 2) clusters_hla <- best_kxi_hla$clusters fortify_pca(m_hla, sup_vars = data.frame(Clusters = clusters_hla)) %>% ggpairs('Clusters', ellipses = TRUE, variables = TRUE)
Print knitr::kable latex table with legend at bottom.
print_table(table_obj, label)
print_table(table_obj, label)
table_obj |
Table object |
label |
Latex label |
None
Bind contingency table and Pearson Chi-squared residuals.
residuals_table(...)
residuals_table(...)
... |
Passed to contingency_table and chisq.test |
Matrix