GSE92332 Processing Pipeline

1

3' polyA RNA-seq

RNA-seq v1.10.2 GitHub

$ Bash example

# Install Salmon (if not already installed)
# conda install -c bioconda salmon

# --- Reference Data Setup ---
# Download human transcriptome FASTA (e.g., Ensembl GRCh38 release 110 cdna)
# Replace with actual paths to your reference files
# wget -O Homo_sapiens.GRCh38.cdna.all.fa.gz http://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
# gunzip Homo_sapiens.GRCh38.cdna.all.fa.gz

TRANSCRIPTOME_FASTA="Homo_sapiens.GRCh38.cdna.all.fa" # Path to your transcriptome FASTA
SALMON_INDEX_DIR="salmon_grch38_index"
OUTPUT_DIR="salmon_quant_output"
READ1="sample_R1.fastq.gz" # Replace with your actual R1 reads
READ2="sample_R2.fastq.gz" # Replace with your actual R2 reads (for paired-end)
# For single-end reads, use READS="sample.fastq.gz" and adjust salmon quant command
THREADS=8 # Number of threads to use

# --- Step 1: Build Salmon Index ---
# This step only needs to be run once per reference transcriptome
echo "Building Salmon index..."
salmon index -t "${TRANSCRIPTOME_FASTA}" -i "${SALMON_INDEX_DIR}" --threads "${THREADS}"

# --- Step 2: Quantify 3' polyA RNA-seq reads ---
echo "Quantifying 3' polyA RNA-seq reads with Salmon..."
salmon quant \
    -i "${SALMON_INDEX_DIR}" \
    -l A \
    -1 "${READ1}" \
    -2 "${READ2}" \
    --gcBias \
    --seqBias \
    -p "${THREADS}" \
    -o "${OUTPUT_DIR}"

echo "Salmon quantification complete. Results are in ${OUTPUT_DIR}"

View on GitHub

2

CellRanger 1.0

Cell Ranger v1.0

$ Bash example

# Download and install Cell Ranger (example, adjust path as needed)
# wget https://cf.10xgenomics.com/releases/cell-exp/cellranger-1.0.0.tar.gz
# tar -xzf cellranger-1.0.0.tar.gz
# export PATH=/path/to/cellranger-1.0.0:$PATH

# Create a reference transcriptome (if not already available) or download a pre-built one.
# This is a placeholder. For Cell Ranger 1.0, a common reference would be human GRCh38.
# Example download for human GRCh38 v1.0.0 reference:
# wget https://cf.10xgenomics.com/releases/cell-exp/refdata-cellranger-GRCh38-1.0.0.tar.gz
# tar -xzf refdata-cellranger-GRCh38-1.0.0.tar.gz
# REF_PATH="refdata-cellranger-GRCh38-1.0.0"

# Example usage of cellranger count for single-cell gene expression data.
# Replace with actual paths and sample names.
OUTPUT_DIR="my_cellranger_output"
REFERENCE_TRANSCRIPTOME="/path/to/your/10x_genomics_reference/refdata-cellranger-GRCh38-1.0.0" # Placeholder for GRCh38 v1.0.0
FASTQ_DIR="/path/to/your/fastq_files" # Directory containing R1, R2, and I1 fastq files
SAMPLE_NAME="my_sample"

cellranger count \
    --id="${OUTPUT_DIR}" \
    --transcriptome="${REFERENCE_TRANSCRIPTOME}" \
    --fastqs="${FASTQ_DIR}" \
    --sample="${SAMPLE_NAME}" \
    --localcores=8 \
    --localmem=64

3

PCA (prcomp in R)

PCA v4.2.0 (Inferred with models/gemini-2.5-flash) GitHub

$ Bash example

# Install R if not already installed (example for Ubuntu)
# sudo apt-get update
# sudo apt-get install r-base

# Create a dummy input TSV file for demonstration
# In a real scenario, replace 'input_matrix.tsv' with your actual data file.
echo -e "Sample\tFeature1\tFeature2\tFeature3\tFeature4\tFeature5" > input_matrix.tsv
echo -e "S1\t10\t20\t30\t40\t50" >> input_matrix.tsv
echo -e "S2\t12\t22\t33\t44\t55" >> input_matrix.tsv
echo -e "S3\t8\t18\t28\t38\t48" >> input_matrix.tsv
echo -e "S4\t11\t21\t31\t41\t51" >> input_matrix.tsv
echo -e "S5\t9\t19\t29\t39\t49" >> input_matrix.tsv

# R script to perform PCA using prcomp
Rscript -e '
  # Read input data (assuming tab-separated with header, first column as row names)
  # Adjust "input_matrix.tsv", "sep", and "row.names" according to your data format.
  data_raw <- read.table("input_matrix.tsv", header = TRUE, sep = "\t", row.names = 1)
  
  # Ensure all columns are numeric for PCA. If your data contains non-numeric columns
  # that should not be part of PCA, select only the numeric ones.
  data_numeric <- as.matrix(data_raw)
  
  # Perform PCA. 
  # `scale. = TRUE` standardizes variables to have unit variance (recommended).
  # `center. = TRUE` centers variables to have zero mean (default and recommended).
  pca_result <- prcomp(data_numeric, scale. = TRUE, center. = TRUE)

  # Save PCA results to CSV files
  # 1. Principal Components (scores): Coordinates of the observations in the principal component space.
  write.csv(pca_result$x, "pca_scores.csv", row.names = TRUE)
  
  # 2. Loadings (rotations): The matrix of variable loadings (eigenvectors).
  write.csv(pca_result$rotation, "pca_loadings.csv", row.names = TRUE)
  
  # 3. Standard deviations of the principal components.
  sdev_df <- as.data.frame(pca_result$sdev)
  colnames(sdev_df) <- "Standard_Deviation"
  write.csv(sdev_df, "pca_sdev.csv", row.names = TRUE)
  
  # 4. Calculate and save the proportion of variance explained by each principal component.
  variance_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
  variance_explained_df <- as.data.frame(variance_explained)
  colnames(variance_explained_df) <- "Proportion_of_Variance_Explained"
  write.csv(variance_explained_df, "pca_variance_explained.csv", row.names = TRUE)
  
  # 5. Calculate and save the cumulative variance explained.
  cumulative_variance_explained <- cumsum(variance_explained)
  cumulative_variance_explained_df <- as.data.frame(cumulative_variance_explained)
  colnames(cumulative_variance_explained_df) <- "Cumulative_Variance_Explained"
  write.csv(cumulative_variance_explained_df, "pca_cumulative_variance_explained.csv", row.names = TRUE)
'

View on GitHub

4

kNN-based graph clustering (R code)

R vNot specified GitHub

$ Bash example

# Install R if not already installed
# sudo apt-get update
# sudo apt-get install r-base

# Install R packages if not already installed
# R -q -e "install.packages('RANN', repos='http://cran.us.r-project.org')"
# R -q -e "install.packages('igraph', repos='http://cran.us.r-project.org')"

# Placeholder for input data: A CSV file where rows are samples and columns are features.
# The first column is assumed to be a unique identifier (e.g., SampleID).
# Example:
# SampleID,Feature1,Feature2,Feature3
# SampleA,0.1,0.5,0.9
# SampleB,0.2,0.6,0.8
# SampleC,0.9,0.1,0.2
# SampleD,0.8,0.2,0.1
# SampleE,0.15,0.55,0.85
input_data="input_data.csv"
output_clusters="output_clusters.csv"
k_neighbors=5 # Number of nearest neighbors for graph construction

# Create a dummy input file for demonstration if it doesn't exist
if [ ! -f "$input_data" ]; then
  echo "Creating a dummy input_data.csv for demonstration."
  echo "SampleID,Feature1,Feature2,Feature3" > "$input_data"
  echo "SampleA,0.1,0.5,0.9" >> "$input_data"
  echo "SampleB,0.2,0.6,0.8" >> "$input_data"
  echo "SampleC,0.9,0.1,0.2" >> "$input_data"
  echo "SampleD,0.8,0.2,0.1" >> "$input_data"
  echo "SampleE,0.15,0.55,0.85" >> "$input_data"
  echo "SampleF,0.95,0.05,0.15" >> "$input_data"
fi

# R script for kNN-based graph clustering
R_SCRIPT="knn_graph_clustering.R"

cat << 'EOF' > "$R_SCRIPT"
# Load necessary libraries
library(RANN) # For nn2 (k-nearest neighbors search)
library(igraph) # For graph creation and community detection algorithms

# --- Parameters (passed from shell) ---
args <- commandArgs(trailingOnly = TRUE)
input_file <- args[1]
output_file <- args[2]
k_neighbors <- as.integer(args[3])

# Validate command-line arguments
if (length(args) != 3) {
  stop("Usage: Rscript knn_graph_clustering.R <input_file.csv> <output_file.csv> <k_neighbors>")
}

# Check if input file exists
if (!file.exists(input_file)) {
  stop(paste("Input file not found:", input_file))
}

# 1. Load data
# Reads a CSV file, assuming the first column contains row names (sample IDs)
# and subsequent columns are numerical features.
data <- as.matrix(read.csv(input_file, row.names = 1))

# 2. Perform kNN search to build the graph
# nn2 finds the k nearest neighbors for each point in 'data'.
# We add 1 to k_neighbors because nn2 includes the point itself in its neighbors list.
knn_results <- nn2(data, k = k_neighbors + 1)

# Extract neighbor indices, excluding the first column which corresponds to the point itself.
neighbor_indices <- knn_results$nn.idx[, -1]

# Create edges for the graph based on kNN results.
# Each point is connected to its k nearest neighbors.
edges <- c()
for (i in 1:nrow(data)) {
  for (j in 1:k_neighbors) {
    neighbor_idx <- neighbor_indices[i, j]
    # Add an edge between point 'i' and its 'j'-th nearest neighbor.
    # For an undirected graph, we only need to add each unique pair once.
    edges <- c(edges, i, neighbor_idx)
  }
}

# Create an igraph graph object.
# 'directed = FALSE' creates an undirected graph.
# 'simplify = TRUE' removes multiple edges between the same two vertices and self-loops.
g <- graph(edges, directed = FALSE)
g <- simplify(g)

# 3. Perform graph clustering (e.g., Louvain community detection)
# The Louvain algorithm is a popular method for detecting communities (clusters)
# in large networks by optimizing modularity.
# Other igraph clustering functions include cluster_walktrap, cluster_fast_greedy, cluster_leiden.
clusters <- cluster_louvain(g)

# Get the cluster assignment for each vertex (sample).
cluster_assignments <- membership(clusters)

# Create a data frame to store the results.
output_df <- data.frame(
  Sample = rownames(data),
  Cluster = cluster_assignments
)

# 4. Save results to a CSV file.
write.csv(output_df, output_file, row.names = FALSE)

message(paste("Clustering complete. Results saved to:", output_file))
EOF

# Execute the R script with the specified parameters
Rscript "$R_SCRIPT" "$input_data" "$output_clusters" "$k_neighbors"

# Clean up the R script (optional)
# rm "$R_SCRIPT"

View on GitHub

5

Full-length RNAseq (SMART-Seq2)

RNA-seq v1.3.3 GitHub

$ Bash example

# Define variables
# Replace with actual paths and filenames
GENOME_FASTA="/path/to/hg38.fa" # Example: Human genome FASTA file (e.g., hg38.fa)
GENOME_DIR="/path/to/STAR_genome_index/hg38" # Directory for STAR genome index
GTF_FILE="/path/to/gencode.v44.annotation.gtf" # Example: Gencode v44 annotation GTF for hg38
READ1="sample_R1.fastq.gz"
READ2="sample_R2.fastq.gz" # Assuming paired-end reads for SMART-Seq2
OUTPUT_PREFIX="sample_output"
RSEM_REFERENCE_NAME="hg38_rsem_ref" # Name for RSEM reference files
STAR_EXECUTABLE="STAR" # Path to STAR executable if not in PATH

# --- Installation (commented out) ---
# # Install STAR (e.g., using conda)
# # conda install -c bioconda star=2.7.10a
#
# # Install RSEM (e.g., using conda)
# # conda install -c bioconda rsem=1.3.3

# --- Reference Preparation (usually done once per genome/annotation, uncomment and run if needed) ---
# # 1. Prepare STAR genome index (if not already done)
# # This step creates the necessary genome index files for STAR alignment.
# # Adjust --sjdbOverhang based on your read length (e.g., ReadLength - 1).
# # ${STAR_EXECUTABLE} --runMode genomeGenerate \
# #      --genomeDir ${GENOME_DIR} \
# #      --genomeFastaFiles ${GENOME_FASTA} \
# #      --sjdbGTFfile ${GTF_FILE} \
# #      --sjdbOverhang 100 \
# #      --runThreadN 8
#
# # 2. Prepare RSEM reference (requires STAR genome index and GTF)
# # This step builds the RSEM reference files, which include transcript sequences and gene-transcript mappings.
# # It uses STAR internally for transcript indexing.
# # rsem-prepare-reference --gtf ${GTF_FILE} \
# #                        --star \
# #                        --star-path ${STAR_EXECUTABLE} \
# #                        --num-threads 8 \
# #                        ${GENOME_FASTA} \
# #                        ${RSEM_REFERENCE_NAME}

# --- Full-length RNAseq (SMART-Seq2) Analysis --- 

# 1. Alignment with STAR
# Aligns paired-end RNA-seq reads to the genome using a splice-aware aligner.
# The --quantMode TranscriptomeSAM option generates a transcriptome-aligned BAM file,
# which is required by RSEM for accurate quantification.
${STAR_EXECUTABLE} --genomeDir ${GENOME_DIR} \
     --readFilesIn ${READ1} ${READ2} \
     --runThreadN 8 \
     --outFileNamePrefix ${OUTPUT_PREFIX}.STAR. \
     --outSAMtype BAM SortedByCoordinate \
     --outSAMunmapped Within \
     --outSAMattributes Standard \
     --quantMode TranscriptomeSAM

# 2. Quantification with RSEM
# Quantifies gene and transcript expression levels from the STAR-aligned transcriptome BAM.
# The output includes FPKM, TPM, and read counts for genes and isoforms.
rsem-calculate-expression --paired-end \
                          --num-threads 8 \
                          --bam ${OUTPUT_PREFIX}.STAR.Aligned.toTranscriptome.out.bam \
                          ${RSEM_REFERENCE_NAME} \
                          ${OUTPUT_PREFIX}.RSEM

View on GitHub

6

Tophat

TopHat v2.1.1 (Inferred with models/gemini-2.5-flash)

$ Bash example

# Install TopHat2 and its dependencies (example using conda)
# conda create -n tophat_env tophat2=2.1.1 bowtie2=2.3.4.3 samtools=1.9 -c bioconda -y
# conda activate tophat_env

# Define variables
# Placeholder for human genome (hg38) Bowtie2 index prefix.
# Ensure the directory '/path/to/genome/index/hg38/' contains Bowtie2 index files
# like hg38.1.bt2, hg38.2.bt2, etc.
GENOME_INDEX_PREFIX="/path/to/genome/index/hg38/hg38"

# Placeholder for human genome (hg38) GTF annotation file (e.g., Gencode v38)
GTF_FILE="/path/to/annotation/gencode.v38.annotation.gtf"

# Input FASTQ file(s)
# Assuming single-end reads for this example. For paired-end, use FASTQ_R1 and FASTQ_R2 variables.
FASTQ_FILE="input_eclip_reads.fastq.gz"
# For paired-end: FASTQ_R1="input_eclip_read1.fastq.gz"
# For paired-end: FASTQ_R2="input_eclip_read2.fastq.gz"

# Output directory for TopHat2 results
OUTPUT_DIR="tophat2_alignment_output"

# Number of threads to use for alignment
THREADS=8

# Create output directory if it doesn't exist
mkdir -p "${OUTPUT_DIR}"

# Execute TopHat2 for eCLIP alignment
# --library-type fr-firststrand is commonly used for eCLIP assays
tophat2 \
    --library-type fr-firststrand \
    --num-threads "${THREADS}" \
    --gtf "${GTF_FILE}" \
    --output-dir "${OUTPUT_DIR}" \
    "${GENOME_INDEX_PREFIX}" \
    "${FASTQ_FILE}"
# For paired-end: tophat2 ... "${GENOME_INDEX_PREFIX}" "${FASTQ_R1}" "${FASTQ_R2}"

7

Bowtie

Bowtie v2.4.2 GitHub

$ Bash example

# Install bowtie2 (e.g., using conda)
# conda install -c bioconda bowtie2=2.4.2

# Define variables (placeholders)
INPUT_FASTQ="input.fastq.gz" # Replace with your input FASTQ file
OUTPUT_SAM="output.sam"     # Replace with your desired output SAM file
THREADS=8                   # Number of threads to use
LOG_FILE="bowtie2.log"      # Log file for bowtie2 output
GENOME_INDEX_PREFIX="/path/to/your/genome/index/hg38" # Replace with the path to your Bowtie2 index prefix (e.g., for hg38)

# Example: Download and build hg38 index if not available (this is a placeholder, actual index building might be more complex)
# mkdir -p /path/to/your/genome/index
# wget -P /path/to/your/genome/index https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
# gunzip /path/to/your/genome/index/hg38.fa.gz
# bowtie2-build /path/to/your/genome/index/hg38.fa /path/to/your/genome/index/hg38

# Run bowtie2 alignment for eCLIP assays
bowtie2 \
    -p "${THREADS}" \
    --local \
    --very-sensitive-local \
    -x "${GENOME_INDEX_PREFIX}" \
    -U "${INPUT_FASTQ}" \
    -S "${OUTPUT_SAM}" \
    2> "${LOG_FILE}"

View on GitHub

8

RSEM

RSEM v1.3.3 (Inferred with models/gemini-2.5-flash) GitHub

$ Bash example

# Install RSEM (if not already installed)
# conda install -c bioconda rsem

# --- Reference Data Preparation (Run once) ---
# RSEM requires an index built from a reference genome FASTA and a gene annotation GTF/GFF file.
# Example for human GRCh38 (hg38) using Gencode v38 annotation:

# # Create a directory for RSEM index files
# mkdir -p rsem_index_hg38
# cd rsem_index_hg38

# # Download reference genome FASTA (primary assembly)
# # wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz
# # gunzip GRCh38.primary_assembly.genome.fa.gz

# # Download gene annotation GTF
# # wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz
# # gunzip gencode.v38.annotation.gtf.gz

# # Build the RSEM reference index
# # This step can take a significant amount of time and memory.
# # rsem-prepare-reference --gtf gencode.v38.annotation.gtf \
# #                      GRCh38.primary_assembly.genome.fa \
# #                      hg38_rsem_index

# --- RSEM Quantification Command ---

# Define input and output paths
# INPUT_BAM: Path to the alignment file (e.g., from STAR, HISAT2) in BAM format.
#            Assumes paired-end reads were aligned.
INPUT_BAM="/path/to/your/aligned_reads.bam"

# RSEM_INDEX: Path to the RSEM reference index directory and prefix created by rsem-prepare-reference.
#             e.g., if index was built as 'hg38_rsem_index' in '/path/to/rsem_index_hg38/', then it's '/path/to/rsem_index_hg38/hg38_rsem_index'
RSEM_INDEX="/path/to/rsem_index_hg38/hg38_rsem_index"

# OUTPUT_PREFIX: Prefix for all output files generated by RSEM (e.g., .genes.results, .isoforms.results, .stat).
OUTPUT_PREFIX="sample_quantification_results"

# Number of threads to use for parallel processing
NUM_THREADS=8

# Strandedness of the RNA-seq library. Common options:
#   --strandedness none (for unstranded libraries)
#   --strandedness forward (for dUTP, Ligation, Standard Illumina with first read sense to transcript)
#   --strandedness reverse (for dUTP, Ligation, Standard Illumina with first read antisense to transcript)
# If unsure, 'none' is a safe default but might reduce accuracy for stranded libraries.
STRANDEDNESS="none"

# Execute RSEM to calculate gene and isoform expression
rsem-calculate-expression \
    --bam \                         # Specify that input is a BAM file
    --paired-end \                  # Specify that reads are paired-end
    --no-qualities \                # Input BAM does not need quality scores for quantification
    --num-threads "${NUM_THREADS}" \ # Number of threads for parallel processing
    --strandedness "${STRANDEDNESS}" \ # Library strandedness
    "${INPUT_BAM}" \                # Input BAM file
    "${RSEM_INDEX}" \               # RSEM reference index
    "${OUTPUT_PREFIX}"              # Output file prefix

View on GitHub

Publication

Dataset

Processing Steps

Tools Used