Introduction

In this notebook, we explore the transcriptomic and clinical data from the TCGA Prostate Adenocarcinoma (PRAD) cohort. The goal is to perform data exploration and apply unsupervised machine learning methods such as hierarchical clustering, k-means, and dimensionality reduction techniques (PCA and t-SNE).

We use three datasets located in the data/ folder:


Load Required Packages

library(ggplot2)
library(tidyverse)
library(pheatmap)
library(broom)
library(skimr)
library(gtsummary)
library(ggfortify)
library(Rtsne)
Warning: package ‘Rtsne’ was built under R version 4.3.3

Load Data

feno <- readRDS("data/feno.RDS")
exp <- readRDS("data/exp.RDS")
exp_top <- readRDS("data/exp_top.RDS")

Initial Data Inspection

dim(exp)
[1] 20530   549
dim(exp_top)
[1] 549 100

We can also take a glimpse of the data structures:

glimpse(feno)
Rows: 549
Columns: 16
$ sample_type                         <fct> Primary Tumor, Primary Tumor, Primary Tumor, Primary Tumor,…
$ age                                 <int> 51, 57, 47, 52, 70, 54, 69, 57, 57, 56, 64, 73, 72, 65, 57,…
$ initial_weight                      <int> 140, 110, 140, 140, 160, 140, 150, 150, 80, 140, 380, NA, N…
$ histological_type                   <chr> "Prostate Adenocarcinoma Acinar Type", "Prostate Adenocarci…
$ ISUP_group                          <fct> 1, 1, 5, 1, 4, 3, 5, 2, 1, 1, 4, 3, 1, 2, 2, 5, 2, 3, 2, 2,…
$ clinical_T_simple                   <fct> NA, T1, T2, T2, T2, T1, T2, T1, T1, T1, T2, NA, NA, NA, NA,…
$ pathologic_T_simple                 <fct> T2, T3, T4, T2, T3, T3, T3, T3, T2, T3, T3, T2, NA, T3, T2,…
$ laterality                          <chr> "Left", "Bilateral", "Bilateral", "Right", "Bilateral", "Le…
$ PFI                                 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ PFI.time                            <int> 621, 1701, 1373, 671, 1378, 112, 198, 1364, 1272, 615, 350,…
$ PSA                                 <dbl> 2.7, 5.0, 44.9, 5.6, 2.4, 7.0, 7.1, 11.6, 4.6, 8.4, 8.4, 14…
$ targeted_molecular_therapy          <fct> NO, NO, YES, NO, NO, NA, NO, NO, NO, NO, YES, NA, NO, NO, N…
$ radiation_therapy                   <fct> NO, NO, YES, NO, NO, NA, NO, NO, NO, NO, NO, NA, NO, NO, NA…
$ is_ffpe                             <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",…
$ initial_pathologic_diagnosis_method <chr> "Core needle biopsy", "Core needle biopsy", "Core needle bi…
$ progression                         <lgl> NA, FALSE, FALSE, NA, FALSE, NA, TRUE, FALSE, FALSE, NA, NA…

Preprocessing

We will:

  1. Transpose the expression matrix to have genes as columns.
  2. Remove genes that have zero expression across all samples.
  3. Filter to include only primary tumor samples.
  4. Synchronize sample IDs across datasets.
  5. Recode the ISUP group as an ordered factor.
exp <- t(exp)
exp <- exp[, colSums(exp != 0) > 0]

# Keep only primary tumor samples
feno <- feno %>% filter(sample_type == "Primary Tumor")
exp <- exp[rownames(feno), ]
exp_top <- exp_top[rownames(feno), ]

# Set ISUP group as ordered factor
feno$ISUP_group <- factor(feno$ISUP_group, ordered = TRUE)
feno <- droplevels(feno)

# Add numeric version of ISUP group
feno$ISUP_group_num <- as.numeric(feno$ISUP_group)

Clinical Summary Table

We summarize the clinical characteristics using gtsummary.

feno %>%
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 2
  )
Characteristic N = 4971
sample_type
    Primary Tumor 497 (100%)
age 61.02 (6.82)
initial_weight 243.13 (293.09)
    Unknown 227
histological_type
    Prostate Adenocarcinoma Acinar Type 482 (97%)
    Prostate Adenocarcinoma, Other Subtype 15 (3.0%)
ISUP_group
    1 45 (9.1%)
    2 146 (29%)
    3 101 (20%)
    4 64 (13%)
    5 141 (28%)
clinical_T_simple
    T1 177 (44%)
    T2 173 (43%)
    T3 53 (13%)
    T4 2 (0.5%)
    Unknown 92
pathologic_T_simple
    T2 187 (38%)
    T3 293 (60%)
    T4 10 (2.0%)
    Unknown 7
laterality
    Bilateral 432 (88%)
    Left 19 (3.9%)
    Right 38 (7.8%)
    Unknown 8
PFI 93 (19%)
PFI.time 964.55 (756.34)
PSA 11.05 (12.37)
    Unknown 15
targeted_molecular_therapy 52 (12%)
    Unknown 50
radiation_therapy 59 (13%)
    Unknown 49
is_ffpe
    NO 440 (100%)
    Unknown 57
initial_pathologic_diagnosis_method
    Core needle biopsy 477 (97%)
    Transurethral resection (TURBT) 15 (3.0%)
    Unknown 5
progression 72 (30%)
    Unknown 254
ISUP_group_num
    1 45 (9.1%)
    2 146 (29%)
    3 101 (20%)
    4 64 (13%)
    5 141 (28%)
1 n (%); Mean (SD)

Unsupervised Clustering

Hierarchical Clustering with Heatmap

We use the top variable genes (exp_top) to reduce dimensionality for visualization.

pheatmap(t(exp_top), show_rownames = FALSE, show_colnames = FALSE, scale = "row")

Now we add clinical annotations to the heatmap.

annotations <- data.frame(feno %>% 
                            select(ISUP_group,
                                   ISUP_group_num,
                                   histological_type, 
                                   clinical_T_simple, 
                                   pathologic_T_simple, 
                                   initial_pathologic_diagnosis_method))

pheatmap(t(exp_top),
         scale = "row",
         annotation = annotations,
         show_rownames = FALSE, 
         show_colnames = FALSE)


Principal Component Analysis (PCA)

We apply PCA on the full expression matrix and visualize the principal components.

exp <- exp[, colSums(exp != 0) > 0]
pca <- prcomp(exp, scale. = TRUE)
autoplot(pca)

We can also color the PCA plot by clinical covariates:

autoplot(pca, data = feno, colour = "ISUP_group")

autoplot(pca, data = feno, colour = "age")


K-Means Clustering

We apply k-means clustering using 5 clusters on the exp_top matrix.

set.seed(123)
km <- kmeans(exp_top, centers = 5, nstart = 25)
feno$cluster_kmeans <- factor(km$cluster)

# Visualize clusters in PCA space
autoplot(pca, data = feno, colour = "cluster_kmeans")


Dimensionality Reduction with t-SNE

We use t-SNE to project samples into two dimensions and visualize the separation between clusters and clinical groups.

set.seed(123)
tsne <- Rtsne(exp_top, perplexity = 30)
plot(tsne$Y, col = km$cluster, pch = 19, main = "t-SNE + K-means Clusters")

plot(tsne$Y, col = feno$ISUP_group, pch = 19, main = "t-SNE Colored by ISUP Group")


Clinical Association Example

Finally, we test whether the identified clusters differ in clinical features. Here, we compare PSA levels across k-means clusters.

boxplot(feno$PSA ~ feno$cluster_kmeans,
        xlab = "K-means Cluster",
        ylab = "Preoperative PSA",
        main = "PSA Levels by Cluster")

LS0tDQp0aXRsZTogIkV4cGxvcmF0b3J5IGFuZCBVbnN1cGVydmlzZWQgQW5hbHlzaXMgb2YgVENHQSBQcm9zdGF0ZSBDYW5jZXIgVHJhbnNjcmlwdG9taWMgRGF0YSINCmF1dGhvcjogIkp1YW4gQml6em90dG8iDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkNCmBgYA0KDQojIyBJbnRyb2R1Y3Rpb24NCg0KSW4gdGhpcyBub3RlYm9vaywgd2UgZXhwbG9yZSB0aGUgdHJhbnNjcmlwdG9taWMgYW5kIGNsaW5pY2FsIGRhdGEgZnJvbSB0aGUgVENHQSBQcm9zdGF0ZSBBZGVub2NhcmNpbm9tYSAoUFJBRCkgY29ob3J0LiBUaGUgZ29hbCBpcyB0byBwZXJmb3JtICoqZGF0YSBleHBsb3JhdGlvbioqIGFuZCBhcHBseSAqKnVuc3VwZXJ2aXNlZCBtYWNoaW5lIGxlYXJuaW5nIG1ldGhvZHMqKiBzdWNoIGFzICoqaGllcmFyY2hpY2FsIGNsdXN0ZXJpbmcqKiwgKiprLW1lYW5zKiosIGFuZCAqKmRpbWVuc2lvbmFsaXR5IHJlZHVjdGlvbioqIHRlY2huaXF1ZXMgKFBDQSBhbmQgdC1TTkUpLg0KDQpXZSB1c2UgdGhyZWUgZGF0YXNldHMgbG9jYXRlZCBpbiB0aGUgYGRhdGEvYCBmb2xkZXI6DQoNCi0gYGZlbm8uUkRTYDogQ2xpbmljYWwgYW5kIHBoZW5vdHlwaWMgZGF0YS4NCi0gYGV4cC5SRFNgOiBDb21wbGV0ZSBnZW5lIGV4cHJlc3Npb24gbWF0cml4IChSTkEtU2VxKS4NCi0gYGV4cF90b3AuUkRTYDogQSBzdWJzZXQgb2YgdGhlIGV4cHJlc3Npb24gbWF0cml4IGluY2x1ZGluZyB0aGUgdG9wIG1vc3QgdmFyaWFibGUgZ2VuZXMsIHVzZWZ1bCBmb3IgY2x1c3RlcmluZyBhbmQgdmlzdWFsaXphdGlvbi4NCg0KLS0tDQoNCiMjIExvYWQgUmVxdWlyZWQgUGFja2FnZXMNCg0KYGBge3IgcGFja2FnZXN9DQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkocGhlYXRtYXApDQpsaWJyYXJ5KGJyb29tKQ0KbGlicmFyeShza2ltcikNCmxpYnJhcnkoZ3RzdW1tYXJ5KQ0KbGlicmFyeShnZ2ZvcnRpZnkpDQpsaWJyYXJ5KFJ0c25lKQ0KYGBgDQoNCi0tLQ0KDQojIyBMb2FkIERhdGENCg0KYGBge3IgbG9hZC1kYXRhfQ0KZmVubyA8LSByZWFkUkRTKCJkYXRhL2Zlbm8uUkRTIikNCmV4cCA8LSByZWFkUkRTKCJkYXRhL2V4cC5SRFMiKQ0KZXhwX3RvcCA8LSByZWFkUkRTKCJkYXRhL2V4cF90b3AuUkRTIikNCmBgYA0KDQotLS0NCg0KIyMgSW5pdGlhbCBEYXRhIEluc3BlY3Rpb24NCg0KYGBge3IgaW5zcGVjdH0NCmRpbShleHApDQpkaW0oZXhwX3RvcCkNCmBgYA0KDQpXZSBjYW4gYWxzbyB0YWtlIGEgZ2xpbXBzZSBvZiB0aGUgZGF0YSBzdHJ1Y3R1cmVzOg0KDQpgYGB7ciBnbGltcHNlfQ0KZ2xpbXBzZShmZW5vKQ0KYGBgDQoNCi0tLQ0KDQojIyBQcmVwcm9jZXNzaW5nDQoNCldlIHdpbGw6DQoNCjEuIFRyYW5zcG9zZSB0aGUgZXhwcmVzc2lvbiBtYXRyaXggdG8gaGF2ZSBnZW5lcyBhcyBjb2x1bW5zLg0KMi4gUmVtb3ZlIGdlbmVzIHRoYXQgaGF2ZSB6ZXJvIGV4cHJlc3Npb24gYWNyb3NzIGFsbCBzYW1wbGVzLg0KMy4gRmlsdGVyIHRvIGluY2x1ZGUgb25seSBwcmltYXJ5IHR1bW9yIHNhbXBsZXMuDQo0LiBTeW5jaHJvbml6ZSBzYW1wbGUgSURzIGFjcm9zcyBkYXRhc2V0cy4NCjUuIFJlY29kZSB0aGUgSVNVUCBncm91cCBhcyBhbiBvcmRlcmVkIGZhY3Rvci4NCg0KYGBge3IgcHJlcHJvY2Vzc30NCmV4cCA8LSB0KGV4cCkNCmV4cCA8LSBleHBbLCBjb2xTdW1zKGV4cCAhPSAwKSA+IDBdDQoNCiMgS2VlcCBvbmx5IHByaW1hcnkgdHVtb3Igc2FtcGxlcw0KZmVubyA8LSBmZW5vICU+JSBmaWx0ZXIoc2FtcGxlX3R5cGUgPT0gIlByaW1hcnkgVHVtb3IiKQ0KZXhwIDwtIGV4cFtyb3duYW1lcyhmZW5vKSwgXQ0KZXhwX3RvcCA8LSBleHBfdG9wW3Jvd25hbWVzKGZlbm8pLCBdDQoNCiMgU2V0IElTVVAgZ3JvdXAgYXMgb3JkZXJlZCBmYWN0b3INCmZlbm8kSVNVUF9ncm91cCA8LSBmYWN0b3IoZmVubyRJU1VQX2dyb3VwLCBvcmRlcmVkID0gVFJVRSkNCmZlbm8gPC0gZHJvcGxldmVscyhmZW5vKQ0KDQojIEFkZCBudW1lcmljIHZlcnNpb24gb2YgSVNVUCBncm91cA0KZmVubyRJU1VQX2dyb3VwX251bSA8LSBhcy5udW1lcmljKGZlbm8kSVNVUF9ncm91cCkNCmBgYA0KDQotLS0NCg0KIyMgQ2xpbmljYWwgU3VtbWFyeSBUYWJsZQ0KDQpXZSBzdW1tYXJpemUgdGhlIGNsaW5pY2FsIGNoYXJhY3RlcmlzdGljcyB1c2luZyBgZ3RzdW1tYXJ5YC4NCg0KYGBge3Igc3VtbWFyeS10YWJsZX0NCmZlbm8gJT4lDQogIHRibF9zdW1tYXJ5KA0KICAgIHN0YXRpc3RpYyA9IGxpc3QoDQogICAgICBhbGxfY29udGludW91cygpIH4gInttZWFufSAoe3NkfSkiLA0KICAgICAgYWxsX2NhdGVnb3JpY2FsKCkgfiAie259ICh7cH0lKSINCiAgICApLA0KICAgIGRpZ2l0cyA9IGFsbF9jb250aW51b3VzKCkgfiAyDQogICkNCmBgYA0KDQotLS0NCg0KIyMgVW5zdXBlcnZpc2VkIENsdXN0ZXJpbmcNCg0KIyMjIEhpZXJhcmNoaWNhbCBDbHVzdGVyaW5nIHdpdGggSGVhdG1hcA0KDQpXZSB1c2UgdGhlIHRvcCB2YXJpYWJsZSBnZW5lcyAoYGV4cF90b3BgKSB0byByZWR1Y2UgZGltZW5zaW9uYWxpdHkgZm9yIHZpc3VhbGl6YXRpb24uDQoNCmBgYHtyIGhlYXRtYXAtYmFzaWN9DQpwaGVhdG1hcCh0KGV4cF90b3ApLCBzaG93X3Jvd25hbWVzID0gRkFMU0UsIHNob3dfY29sbmFtZXMgPSBGQUxTRSwgc2NhbGUgPSAicm93IikNCmBgYA0KDQpOb3cgd2UgYWRkIGNsaW5pY2FsIGFubm90YXRpb25zIHRvIHRoZSBoZWF0bWFwLg0KDQpgYGB7ciBoZWF0bWFwLWFubm90YXRlZH0NCmFubm90YXRpb25zIDwtIGRhdGEuZnJhbWUoZmVubyAlPiUgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgc2VsZWN0KElTVVBfZ3JvdXAsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElTVVBfZ3JvdXBfbnVtLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBoaXN0b2xvZ2ljYWxfdHlwZSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGNsaW5pY2FsX1Rfc2ltcGxlLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcGF0aG9sb2dpY19UX3NpbXBsZSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGluaXRpYWxfcGF0aG9sb2dpY19kaWFnbm9zaXNfbWV0aG9kKSkNCg0KcGhlYXRtYXAodChleHBfdG9wKSwNCiAgICAgICAgIHNjYWxlID0gInJvdyIsDQogICAgICAgICBhbm5vdGF0aW9uID0gYW5ub3RhdGlvbnMsDQogICAgICAgICBzaG93X3Jvd25hbWVzID0gRkFMU0UsIA0KICAgICAgICAgc2hvd19jb2xuYW1lcyA9IEZBTFNFKQ0KYGBgDQoNCi0tLQ0KDQojIyBQcmluY2lwYWwgQ29tcG9uZW50IEFuYWx5c2lzIChQQ0EpDQoNCldlIGFwcGx5IFBDQSBvbiB0aGUgZnVsbCBleHByZXNzaW9uIG1hdHJpeCBhbmQgdmlzdWFsaXplIHRoZSBwcmluY2lwYWwgY29tcG9uZW50cy4NCg0KYGBge3IgcGNhLWJhc2ljfQ0KZXhwIDwtIGV4cFssIGNvbFN1bXMoZXhwICE9IDApID4gMF0NCnBjYSA8LSBwcmNvbXAoZXhwLCBzY2FsZS4gPSBUUlVFKQ0KYXV0b3Bsb3QocGNhKQ0KYGBgDQoNCldlIGNhbiBhbHNvIGNvbG9yIHRoZSBQQ0EgcGxvdCBieSBjbGluaWNhbCBjb3ZhcmlhdGVzOg0KDQpgYGB7ciBwY2EtY29sb3JlZH0NCmF1dG9wbG90KHBjYSwgZGF0YSA9IGZlbm8sIGNvbG91ciA9ICJJU1VQX2dyb3VwIikNCmF1dG9wbG90KHBjYSwgZGF0YSA9IGZlbm8sIGNvbG91ciA9ICJhZ2UiKQ0KYGBgDQoNCi0tLQ0KDQojIyBLLU1lYW5zIENsdXN0ZXJpbmcNCg0KV2UgYXBwbHkgay1tZWFucyBjbHVzdGVyaW5nIHVzaW5nIDUgY2x1c3RlcnMgb24gdGhlIGBleHBfdG9wYCBtYXRyaXguDQoNCmBgYHtyIGttZWFuc30NCnNldC5zZWVkKDEyMykNCmttIDwtIGttZWFucyhleHBfdG9wLCBjZW50ZXJzID0gNSwgbnN0YXJ0ID0gMjUpDQpmZW5vJGNsdXN0ZXJfa21lYW5zIDwtIGZhY3RvcihrbSRjbHVzdGVyKQ0KDQojIFZpc3VhbGl6ZSBjbHVzdGVycyBpbiBQQ0Egc3BhY2UNCmF1dG9wbG90KHBjYSwgZGF0YSA9IGZlbm8sIGNvbG91ciA9ICJjbHVzdGVyX2ttZWFucyIpDQpgYGANCg0KLS0tDQoNCiMjIERpbWVuc2lvbmFsaXR5IFJlZHVjdGlvbiB3aXRoIHQtU05FDQoNCldlIHVzZSB0LVNORSB0byBwcm9qZWN0IHNhbXBsZXMgaW50byB0d28gZGltZW5zaW9ucyBhbmQgdmlzdWFsaXplIHRoZSBzZXBhcmF0aW9uIGJldHdlZW4gY2x1c3RlcnMgYW5kIGNsaW5pY2FsIGdyb3Vwcy4NCg0KYGBge3IgdHNuZX0NCnNldC5zZWVkKDEyMykNCnRzbmUgPC0gUnRzbmUoZXhwX3RvcCwgcGVycGxleGl0eSA9IDMwKQ0KYGBgDQoNCmBgYHtyIHRzbmUtcGxvdH0NCnBsb3QodHNuZSRZLCBjb2wgPSBrbSRjbHVzdGVyLCBwY2ggPSAxOSwgbWFpbiA9ICJ0LVNORSArIEstbWVhbnMgQ2x1c3RlcnMiKQ0KcGxvdCh0c25lJFksIGNvbCA9IGZlbm8kSVNVUF9ncm91cCwgcGNoID0gMTksIG1haW4gPSAidC1TTkUgQ29sb3JlZCBieSBJU1VQIEdyb3VwIikNCmBgYA0KDQotLS0NCg0KIyMgQ2xpbmljYWwgQXNzb2NpYXRpb24gRXhhbXBsZQ0KDQpGaW5hbGx5LCB3ZSB0ZXN0IHdoZXRoZXIgdGhlIGlkZW50aWZpZWQgY2x1c3RlcnMgZGlmZmVyIGluIGNsaW5pY2FsIGZlYXR1cmVzLiBIZXJlLCB3ZSBjb21wYXJlIFBTQSBsZXZlbHMgYWNyb3NzIGstbWVhbnMgY2x1c3RlcnMuDQoNCmBgYHtyIHBzYS1ib3hwbG90fQ0KYm94cGxvdChmZW5vJFBTQSB+IGZlbm8kY2x1c3Rlcl9rbWVhbnMsDQogICAgICAgIHhsYWIgPSAiSy1tZWFucyBDbHVzdGVyIiwNCiAgICAgICAgeWxhYiA9ICJQcmVvcGVyYXRpdmUgUFNBIiwNCiAgICAgICAgbWFpbiA9ICJQU0EgTGV2ZWxzIGJ5IENsdXN0ZXIiKQ0KYGBgDQoNCg0K