3
$\begingroup$

I have some gwas summary statistics in GRCh38 that I want to lift to GRCh37. I am trying to liftover in R using this code:

library(tidyverse)
library(magrittr)
library(data.table)
library(rtracklayer)
library(GenomicRanges)

rm(list=ls())

gwas_data <- fread("/gwas_sumstats_allchr.txt")
chain_file <- "/chain_files/hg38ToHg19.over.chain"
chain <- import.chain(chain_file)

# Convert to GRanges object (assuming GENPOS is 1-based)
gwas_ranges <- GRanges(
  seqnames = Rle(gwas_data$CHROM),
  ranges = IRanges(start = gwas_data$GENPOS - 1, end = gwas_data$GENPOS) # Convert to 0-based start
)

gwas_ranges$seqnames <- paste0("chr", gwas_ranges$seqnames)

# Perform the liftover
liftover_output <- liftOver(gwas_ranges, chain)

# Extract successful liftover
liftover_success <- liftover_output$`output`

# Convert the results back to a data frame (convert positions back to 1-based)
result <- data.frame(
  CHROM = as.character(seqnames(liftover_success)),
  GENPOS = start(liftover_success) + 1  # Convert back to 1-based position
)

# Print or save the result
print(head(result))

However this doesn't work and I am getting:

liftover_output <- liftOver(gwas_ranges, chain)
Discarding unchained sequences: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1, 20, 21, 22, 2, 3, 4, 5, 6, 7, 8, 9

My chain file seems fine on checking it (downloaded from UCSC genome browser), I align my chromosomes to have "chr" like the 38 chain does, and my gwas data is indeed in GRCh38 - what else could be going wrong?

My gwas_ranges looks like:

head(gwas_ranges)
GRanges object with 6 ranges and 1 metadata column:
seqnames      ranges strand |    seqnames
<Rle>   <IRanges>  <Rle> | <character>
[1]       10 10708-10709      * |         chr
[2]       10 10904-10905      * |         chr
[3]       10 10942-10943      * |         chr
[4]       10 12112-12113      * |         chr
[5]       10 13850-13851      * |         chr
[6]       10 14537-14538      * |         chr
-------   seqinfo: 22 sequences from an unspecified genome; no seqlengths

I also tried doing the liftover in python using hail. This ran without error, but the liftover of SNPs on chromosome 10 position 1234 would lift to chromosome 18 12369 . Here's my hail attempt:

import hail as hl
hl.stop()
hl.init()

# Load the reference genomes and add the liftover chain file
rg38 = hl.get_reference("GRCh38")
rg37 = hl.get_reference("GRCh37")
local_chain_file = "/references_grch38_to_grch37.over.chain.gz" #downloaded from Hail's GCP
rg38.add_liftover(local_chain_file, rg37)

# Import GWAS data
gwas = hl.import_table('/gwas_sumstats_allchr.txt', 
                       types={'CHROM': 'str', 'GENPOS': 'int'})

# Standardize chromosome names in your GWAS data to match GRCh38
gwas = gwas.annotate(CHROM=hl.if_else(hl.literal('chr').contains(gwas.CHROM), gwas.CHROM, hl.literal('chr') + gwas.CHROM))

# Perform the liftover from GRCh38 to GRCh37
gwas = gwas.annotate(
    locus_grch38=hl.locus(gwas.CHROM, gwas.GENPOS, reference_genome='GRCh38'),
    locus_grch37=hl.liftover(hl.locus(gwas.CHROM, gwas.GENPOS, reference_genome='GRCh38'), 'GRCh37')
)

# Add new columns for CHROM_38 and GENPOS_38
gwas = gwas.annotate(
    CHROM_38=hl.if_else(hl.is_defined(gwas.locus_grch37), gwas.locus_grch37.contig, hl.null('str')),
    GENPOS_38=hl.if_else(hl.is_defined(gwas.locus_grch37), gwas.locus_grch37.position, hl.null('int'))
)

gwas = gwas.drop('locus_grch38', 'locus_grch37')

# Export the results
gwas.export('/gwas_sumstats_allchr_GRCh37.txt')

Here's some of the output I get from the hail run:

Head of gwas_sumstats_GRCh38txt:

CHROM   GENPOS  ID  ALLELE0 ALLELE1 A1FREQ  INFO    N   TEST    BETA    SE  CHISQ   LOG10P  p   MAF
10  10709   10:10709:G:A    G   A   0.0140284   0.420904    56210   ADD 0.143464    0.0384744   13.904  3.71581 0.000192393324862147    0.0140284
10  10905   10:10905:G:A    G   A   0.0352519   0.418554    56210   ADD -0.000990946    0.0246439   0.0016169   0.0141582   0.967925206925404   0.0352519
10  10943   10:10943:G:C    G   C   0.0653494   0.469298    56210   ADD -0.0130032  0.0173743   0.56013 0.342745    0.454208230910946   0.0653494
10  12113   10:12113:AC:A   AC  A   0.0390721   0.431839    56210   ADD 0.0386709   0.0230572   2.81293 1.02915 0.0935082652260488  0.0390721

Head of gwas_sumstats_GRCh37.txt:

CHROM   GENPOS  ID  A1FREQ  INFO    N   TEST    BETA    SE  CHISQ   LOG10P  p   MAF original_locus
18  10905   18:10905:G:A    0.0140284   0.420904    56210   ADD 0.143464    0.0384744   13.904  3.71581 0.000192393324862147    0.0140284   chr10:10709
18  11258   18:11258:G:A    0.0352519   0.418554    56210   ADD -0.000990946    0.0246439   0.0016169   0.0141582   0.967925206925404   0.0352519   chr10:10905
18  11296   18:11296:G:C    0.0653494   0.469298    56210   ADD -0.0130032  0.0173743   0.56013 0.342745    0.454208230910946   0.0653494   chr10:10943
18  12467   18:12467:AC:A   0.0390721   0.431839    56210   ADD 0.0386709   0.0230572   2.81293 1.02915 0.0935082652260488  0.0390721   chr10:12113
$\endgroup$
1
  • 1
    $\begingroup$ The BCFtools/liftover plugin here is exactly aimed at handling GWAS summary statistics and it will do a much better job at dealing with summary statistics for indels $\endgroup$ Commented Nov 20, 2023 at 15:47

1 Answer 1

2
$\begingroup$

Thanks for your post, I tried your code in R with my GwasSumstats with some adjustments from https://bioconductor.org/packages/release/workflows/vignettes/liftOver/inst/doc/liftov.html and it works. So, here is mine :

library(tidyverse)
library(magrittr)
library(data.table)
library(rtracklayer)
library(GenomicRanges)

meta <- fread("OUTCOME_FILES.txt")
ch = import.chain("hg19ToHg38.over.chain")
gwas_ranges <- GRanges(
  seqnames = Rle(meta$Chr),
  ranges = IRanges(start = meta$Start, end = meta$Start), #Didn't have the end info
  strand = rep("*",length(meta$Chr)),
  DISEASE = rep("RPVO",length(meta$Chr)),
  SNPS = meta$SNP.y
)

# I think this step is the most important one
seqlevelsStyle(gwas_ranges) = "UCSC"


# Perform the liftover
liftover_output <- liftOver(gwas_ranges, ch)
liftover_success <- attributes(liftover_output)$unlistData
result <- data.frame(
  CHROM = as.character(seqnames(liftover_success)),
  GENPOS = start(liftover_success),
  SNP=liftover_success$SNPS
)

Here is the head of my "meta" data :

     Chr  Start       SNP.y
   <int>  <int>      <char>
1:     1 636285 rs545945172
2:     1 649192 rs201942322
3:     1 662622  rs61769339
4:     1 693731  rs12238997
5:     1 693823  rs61769351
6:     1 707522 rs371890604

Here is the head of my "liftover_output" :

GRangesList object of length 6631104:
[[1]]
GRanges object with 1 range and 2 metadata columns:
      seqnames    ranges strand |     DISEASE        SNPS
         <Rle> <IRanges>  <Rle> | <character> <character>
  [1]     chr1    700905      * |        RPVO rs545945172
  -------
  seqinfo: 23 sequences from an unspecified genome; no seqlengths

[[2]]
GRanges object with 1 range and 2 metadata columns:
      seqnames    ranges strand |     DISEASE        SNPS
         <Rle> <IRanges>  <Rle> | <character> <character>
  [1]     chr1    713812      * |        RPVO rs201942322
  -------
  seqinfo: 23 sequences from an unspecified genome; no seqlengths

[[3]]
GRanges object with 1 range and 2 metadata columns:
      seqnames    ranges strand |     DISEASE        SNPS
         <Rle> <IRanges>  <Rle> | <character> <character>
  [1]     chr1    727242      * |        RPVO  rs61769339
  -------
  seqinfo: 23 sequences from an unspecified genome; no seqlengths

...
<6631101 more elements>

Here is the head of my "liftover_success" :

GRanges object with 6624136 ranges and 2 metadata columns:
            seqnames    ranges strand |     DISEASE        SNPS
               <Rle> <IRanges>  <Rle> | <character> <character>
        [1]     chr1    700905      * |        RPVO rs545945172
        [2]     chr1    713812      * |        RPVO rs201942322
        [3]     chr1    727242      * |        RPVO  rs61769339
        [4]     chr1    758351      * |        RPVO  rs12238997
        [5]     chr1    758443      * |        RPVO  rs61769351
        ...      ...       ...    ... .         ...         ...
  [6624132]    chr22  50794872      * |        RPVO   rs9616839
  [6624133]    chr22  50794884      * |        RPVO  rs62240043
  [6624134]    chr22  50794919      * |        RPVO  rs62240044
  [6624135]    chr22  50797585      * |        RPVO        <NA>
  [6624136]    chr22  50798635      * |        RPVO   rs3896457
  -------
  seqinfo: 23 sequences from an unspecified genome; no seqlengths

Here is the head oh the "result" data.frame :

 CHROM GENPOS         SNP
1  chr1 700905 rs545945172
2  chr1 713812 rs201942322
3  chr1 727242  rs61769339
4  chr1 758351  rs12238997
5  chr1 758443  rs61769351
6  chr1 772142 rs371890604

My session Info if needed :

R version 4.1.0 (2021-05-18)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /module/apps/miniconda/python3/envs/R4.1/lib/libopenblasp-r0.3.15.so

locale:
 [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
 [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
 [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] gwascat_2.24.0       rtracklayer_1.52.1   GenomicRanges_1.44.0
 [4] GenomeInfoDb_1.28.4  IRanges_2.26.0       S4Vectors_0.30.2    
 [7] BiocGenerics_0.38.0  tibble_3.2.1         stringr_1.5.1       
[10] OpenRepGrid_0.1.14   plinkbinr_0.0.0.9000 ieugwasr_0.1.5      
[13] dplyr_1.1.4          TwoSampleMR_0.5.7    data.table_1.15.2   

loaded via a namespace (and not attached):
 [1] nlme_3.1-163                bitops_1.0-7               
 [3] matrixStats_1.2.0           bit64_4.0.5                
 [5] filelock_1.0.3              progress_1.2.2             
 [7] httr_1.4.7                  tools_4.1.0                
 [9] utf8_1.2.4                  R6_2.5.1                   
[11] DBI_1.2.2                   colorspace_2.1-0           
[13] withr_3.0.0                 tidyselect_1.2.0           
[15] prettyunits_1.2.0           mnormt_2.1.1               
[17] bit_4.0.5                   curl_5.2.0                 
[19] compiler_4.1.0              cli_3.6.2                  
[21] Biobase_2.52.0              xml2_1.3.5                 
[23] DelayedArray_0.18.0         psych_2.3.9                
[25] readr_2.1.4                 rappdirs_0.3.3             
[27] digest_0.6.34               Rsamtools_2.8.0            
[29] R.utils_2.12.2              XVector_0.32.0             
[31] base64enc_0.1-3             pkgconfig_2.0.3            
[33] htmltools_0.5.6.1           MatrixGenerics_1.4.3       
[35] dbplyr_2.4.0                fastmap_1.1.1              
[37] BSgenome_1.60.0             htmlwidgets_1.6.2          
[39] rlang_1.1.3                 RSQLite_2.3.5              
[41] BiocIO_1.2.0                generics_0.1.3             
[43] jsonlite_1.8.8              vroom_1.6.4                
[45] BiocParallel_1.26.2         R.oo_1.25.0                
[47] VariantAnnotation_1.38.0    RCurl_1.98-1.12            
[49] magrittr_2.0.3              GenomeInfoDbData_1.2.6     
[51] pvclust_2.2-0               Matrix_1.6-5               
[53] Rcpp_1.0.12                 fansi_1.0.6                
[55] abind_1.4-5                 lifecycle_1.0.4            
[57] R.methodsS3_1.8.2           stringi_1.8.3              
[59] yaml_2.3.7                  SummarizedExperiment_1.22.0
[61] zlibbioc_1.38.0             plyr_1.8.9                 
[63] BiocFileCache_2.10.1        grid_4.1.0                 
[65] blob_1.2.4                  snpStats_1.42.0            
[67] crayon_1.5.2                lattice_0.22-5             
[69] splines_4.1.0               Biostrings_2.60.2          
[71] GenomicFeatures_1.44.2      hms_1.1.3                  
[73] KEGGREST_1.32.0             knitr_1.45                 
[75] pillar_1.9.0                rjson_0.2.21               
[77] biomaRt_2.48.3              XML_3.99-0.14              
[79] glue_1.7.0                  BiocManager_1.30.22        
[81] vctrs_0.6.5                 png_0.1-8                  
[83] tzdb_0.4.0                  cachem_1.0.8               
[85] xfun_0.40                   restfulr_0.0.15            
[87] survival_3.5-7              GenomicAlignments_1.28.0   
[89] AnnotationDbi_1.54.1        memoise_2.0.1              
[91] rgl_1.2.8 
$\endgroup$

Not the answer you're looking for? Browse other questions tagged or ask your own question.