Problem in merging a gff file and a csv file in R

49 views Asked by At

I have a GFF file and a CSV file, which look like this:

# CSV data frame (example data).
# One row per (Sample, Name) pair, so the same Name can occur under more than
# one Sample (e.g. B005230.2.1 appears for both Sample_1 and Sample_2).
file.csv <- utils::read.table(
  header = TRUE,
  text = "Sample  Name    Estimate    Std.Err P.Adjust
Sample_1    B005300.2.1 0.345930183 0.05662846  1.58E-06
Sample_1    B005230.2.1 0.048159129 0.013862871 0.019181546
Sample_1    B006450.2.1 -0.263951161    0.079297432 0.027327576
Sample_2    B005230.2.1 39.04308043 11.23861018 0.019181546
Sample_2    B006260.1.1 0.003968994 0.00063087  6.12E-07
Sample_2    B006170.2.1 0.117171563 0.024018888 0.000272761
Sample_3    B006450.2.1 0.012033053 0.003670908 0.030632664
Sample_3    B006980.1-c2.1  -0.007653796    0.002047582 0.009944649
Sample_3    B006980.1.1 -0.011369481    0.002871014 0.00539717"
)

# GFF GRanges, example data
# This is the output of dput(head(GFF)): the first six features of the
# annotation, rebuilt as a GRanges object via new(). Evaluating it needs the
# GRanges / IRanges / Rle / DFrame S4 classes to be defined (presumably by
# loading the Bioconductor GenomicRanges package -- confirm).
# All six ranges are on seqname "Bch01" and strand "-".
# Metadata columns: source, type, score, phase, ID, Parent, Name, Note,
# ref_id, Dbxref, Ontology_term.
# Note: Name is NA for the CDS and exon rows; only the gene and mRNA rows
# carry a Name value.
#dput(head(GFF))
GFF <- new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = c("Bch01", "Bch02", "Bch03", "Bch04", "Bch05"), class = "factor"), lengths = 6L, 
    elementMetadata = NULL, metadata = list()), ranges = new("IRanges", 
    start = c(21882L, 21882L, 21882L, 21882L, 22697L, 22697L), 
    width = c(126L, 126L, 126L, 126L, 60L, 60L), NAMES = NULL, 
    elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    strand = new("Rle", values = structure(2L, .Label = c("+", 
    "-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = c("Bch01", "Bch02", "Bch03", "Bch04", "Bch05"), seqlengths = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), is_circular = c(NA, NA, NA, NA, NA), genome = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_)), elementMetadata = new("DFrame", rownames = NULL, nrows = 6L, 
        listData = list(source = structure(c(1L, 1L, 1L, 1L, 
        1L, 1L), .Label = "maker", class = "factor"), type = structure(c(1L, 
        2L, 3L, 4L, 1L, 2L), .Label = c("CDS", "exon", "gene", 
        "mRNA", "three_prime_UTR", "five_prime_UTR"), class = "factor"), 
            score = c(NA, NA, NA, 126, NA, NA), phase = c(0L, 
            NA, NA, NA, 0L, NA), ID = c("B024400.1.1:cds", 
            "B024400.1.1:exon:2", "B024400.1", 
            "B024400.1.1", "B008910.1.1:cds", 
            "B008910.1.1:exon:4"), Parent = new("CompressedCharacterList", 
                elementType = "character", elementMetadata = NULL, 
                metadata = list(), unlistData = c("B024400.1.1", 
                "B024400.1.1", "B024400.1", 
                "B008910.1.1", "B008910.1.1"
                ), partitioning = new("PartitioningByEnd", end = c(1L, 
                2L, 2L, 3L, 4L, 5L), NAMES = NULL, elementType = "ANY", 
                  elementMetadata = NULL, metadata = list())), 
            Name = c(NA, NA, "B024400.1", "B024400.1.1", 
            NA, NA), Note = new("CompressedCharacterList", elementType = "character", 
                elementMetadata = NULL, metadata = list(), unlistData = c("Similar to B024400.1.1: LOW QUALITY:50S ribosomal protein L4, chloroplastic", "Similar to B024400.1.1: LOW QUALITY:50S ribosomal protein L4, chloroplastic"), partitioning = new("PartitioningByEnd", end = c(0L, 
                0L, 1L, 2L, 2L, 2L), NAMES = NULL, elementType = "ANY", 
                  elementMetadata = NULL, metadata = list())), 
            ref_id = c(NA, NA, "B024400.1.1", "B024400.1.1", 
            NA, NA), Dbxref = new("CompressedCharacterList", 
                elementType = "character", elementMetadata = NULL, 
                metadata = list(), unlistData = character(0), 
                partitioning = new("PartitioningByEnd", end = c(0L, 
                0L, 0L, 0L, 0L, 0L), NAMES = NULL, elementType = "ANY", 
                  elementMetadata = NULL, metadata = list())), 
            Ontology_term = new("CompressedCharacterList", elementType = "character", 
                elementMetadata = NULL, metadata = list(), unlistData = character(0), 
                partitioning = new("PartitioningByEnd", end = c(0L, 
                0L, 0L, 0L, 0L, 0L), NAMES = NULL, elementType = "ANY", 
                  elementMetadata = NULL, metadata = list()))), 
        elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    elementType = "ANY", metadata = list())

I want to merge the two files on the Name column. I tried:

# Attempted approach: import the GFF annotation from disk, then join it to the
# CSV table on the shared "Name" column.
GFF <- rtracklayer::import("gene_models.gff")
merge_data <- merge(x = file.csv, y = GFF, by = "Name")

However, in the CSV file the same Name value can appear for different samples (for example, B005230.2.1 appears for both Sample_1 and Sample_2), whereas it occurs only once in the GFF file. Because of this, the merged result is messed up. I would appreciate any help in fixing this problem. Thank you!

0

There are 0 answers