library(stringr)
library(progress)
library(rentrez)
library(openxlsx)
library(ape)
library(dplyr)
setwd("./")
cafe_count <- read.csv("/Gamma_count.tab", sep = "\t")
cafe_fam <- read.csv("/Gamma_family_results.txt", sep = "\t")
sig_cafe_fam <- cafe_fam[cafe_fam$pvalue < 0.05,]
head(cafe_count)
ortho_count <- read.csv("./Orthogroups.GeneCount.tsv", sep = "\t")
head(ortho_count)
ortho_group <- read.csv("./Orthogroups.tsv", sep = "\t")
head(ortho_group)
namecon <- read.xlsx("./convert_name.xlsx")
summary_dt <- readRDS("./summary_cafe_19.rds")
#write.xlsx(summary_dt, "./summary_cafe_19.xlsx")
#check the ortholog group to functions
sig_summary_dt <- summary_dt %>% dplyr::filter(p_agar < 0.05)
#togetfunc <- ortho_group %>% dplyr::filter(Orthogroup %in% sig_summary_dt$Orthogroup)
colnames(ortho_group) <- gsub("\\.protein|_protein", "", colnames(ortho_group))
togetfunc <- merge(sig_summary_dt, ortho_group, by.x = "Orthogroup", by.y = "Orthogroup", all.x = T, all.y = F)
#get functions from gff
dim(togetfunc)
alltogetfunc <- merge(summary_dt, ortho_group, by.x = "Orthogroup", by.y = "Orthogroup", all.x = T, all.y = F)
allgff <- list.files("./genome/gff/")
allgff <- allgff[grepl("gz", allgff)]
sum(gsub("\\.repaeat*|\\.gene.*|_genomic.*", "", allgff) %in% namecon$ncbi)
setdiff(gsub("\\.gene.*|_genomic.*", "", allgff), namecon$ncbi) #all gff match with the namecon ncbi
for(j in 15+(0:(ncol(alltogetfunc)-15))){
#j = 16
colnames(alltogetfunc)[j]
species <- colnames(alltogetfunc)[j]
seqid_row <- c()
func_row <- c()
prod_row <- c()
locus_tag_row <- c()
prot_row <- c()
uniprot_row <- c()
go_row <- c()
goid_row <- c()
gobp_row <- c()
for(k in 1:nrow(alltogetfunc)){
#k = 1
if(file.exists(paste0("./genome/uniprot/touniprot/manual/uniprot_result/", species, ".xlsx"))){
#k = 4 #each row
eachgff <- read.gff(paste0("./genome/gff/", allgff[grep(species, allgff)]))
tosearch <- unlist(str_split(alltogetfunc[k,colnames(alltogetfunc)[j]], ", "))
outgff <- eachgff %>% dplyr::filter(grepl(paste(tosearch, collapse = "|"), attributes)) %>% dplyr::select(seqid, attributes) %>%
dplyr::filter(!duplicated(seqid)) %>%
dplyr::mutate(functions = str_extract(attributes, "Note=[^;]+"),
functions = str_replace(functions, "Note=", ""),
product = str_extract(attributes, "product=[^;]+"),
product = str_replace(product, "product=", ""),
ID = gsub("ID=|\\;.*|\\:.*|\\.v5.*|\\.t1.*|gene_id |\"", "", attributes),
Name = str_extract(attributes, "Name=[^;]+"),
Name = str_replace(Name, "Name=", ""),
gene_id = str_extract(attributes, "gene_id [^;]+"),
gene_id = gsub("\"", "", str_replace(gene_id, "gene_id ", "")),
locus_tag = gsub("locus_tag=", "", str_extract(attributes, "locus_tag=[^;]+"))
)
if(any(grepl("chr", outgff$seqid))){
seqid_out <- paste(unique(na.omit(outgff$ID)), collapse = ", ")
}else{
seqid_out <- paste(unique(na.omit(outgff$seqid)), collapse = ", ")
}
func_out <- paste(unique(na.omit(outgff$functions)), collapse = ", ")
prod_out <- paste(unique(na.omit(outgff$product)), collapse = ", ")
locus_tag_out <- paste(unique(na.omit(outgff$locus_tag)), collapse = ", ")
#match go terms
outgff$locus_tag
eachmanual <- openxlsx::read.xlsx(paste0("./genome/uniprot/touniprot/protogene/", species, ".xlsx"), sep = "\t")
eachconversion <- openxlsx::read.xlsx(paste0("./genome/uniprot/touniprot/manual/uniprot_result/", species, ".xlsx"))
togo <- eachconversion %>% dplyr::filter(Gene.Names %in% outgff$locus_tag)
prot_out <- paste(unique(unlist(stringr::str_split(togo$Protein.names, "; "))), collapse = "; ")
uniprot_out <- paste(unique(unlist(stringr::str_split(togo$Entry, "; "))), collapse = "; ")
go_out <- paste(unique(unlist(stringr::str_split(togo$`Gene.Ontology.(GO)`, "; "))), collapse = "; ")
goid_out <- paste(unique(unlist(stringr::str_split(togo$Gene.Ontology.IDs, "; "))), collapse = "; ")
gobp_out <- paste(unique(unlist(stringr::str_split(togo$`Gene.Ontology.(biological.process)`, "; "))), collapse = "; ")
}else{
prot_out <- NA
uniprot_out <- NA
go_out <- NA
goid_out <- NA
gobp_out <- NA
seqid_out <- NA
func_out <- NA
prod_out <- NA
locus_tag_out <- NA
}
#####add go terms
seqid_row <- c(seqid_row, seqid_out)
func_row <- c(func_row, func_out)
prod_row <- c(prod_row, prod_out)
locus_tag_row <- c(locus_tag_row, locus_tag_out)
prot_row <- c(prot_row, prot_out)
uniprot_row <- c(uniprot_row, uniprot_out)
go_row <- c(go_row, go_out)
goid_row <- c(goid_row, goid_out)
gobp_row <- c(gobp_row, gobp_out)
print(paste(j, k, func_out, prod_out))
}
alltogetfunc[,paste("seqid", colnames(alltogetfunc)[j], collapse = "_")] <- seqid_row
alltogetfunc[,paste("function", colnames(alltogetfunc)[j], collapse = "_")] <- func_row
alltogetfunc[,paste("product", colnames(alltogetfunc)[j], collapse = "_")] <- prod_row
alltogetfunc[,paste("locus_tag", colnames(alltogetfunc)[j], collapse = "_")] <- locus_tag_row
alltogetfunc[,paste("prot", colnames(alltogetfunc)[j], collapse = "_")] <- prot_row
alltogetfunc[,paste("uniprot", colnames(alltogetfunc)[j], collapse = "_")] <- uniprot_row
alltogetfunc[,paste("go", colnames(alltogetfunc)[j], collapse = "_")] <- go_row
alltogetfunc[,paste("goid", colnames(alltogetfunc)[j], collapse = "_")] <- goid_row
alltogetfunc[,paste("gobp", colnames(alltogetfunc)[j], collapse = "_")] <- gobp_row
}
write_rds(alltogetfunc, "./cafe/orthogenefunction_go_19.rds")
write.xlsx(alltogetfunc, "./cafe/orthogenefunction_go_19.xlsx")
#blast result
blastp_path <- "./genome/toblast/out_blastp/"
blastp_file <- list.files(blastp_path)
# Define filter criteria
evalue_threshold <- 1e-5
pident_threshold <- 30
qcovs_threshold <- 50
all_filtered_blast_df <- c()
for(i in 1:length(blastp_file)){
#i = 1
blast_df <- read.table(paste0("./genome/toblast/out_blastp/",blastp_file[i]), sep = "\t", header = FALSE)
# Assign headers to the data frame
colnames(blast_df) <- c("query_id", "subject_id", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovs")
# Apply filters
filtered_blast_df <- subset(blast_df, evalue <= evalue_threshold & pident >= pident_threshold & qcovs >= qcovs_threshold) %>%
group_by(query_id) %>%
slice_max(pident, with_ties = FALSE)
filtered_blast_df$species <- gsub(".txt", "", blastp_file[i])
all_filtered_blast_df <- rbind(all_filtered_blast_df, filtered_blast_df)
dim(filtered_blast_df[!duplicated(filtered_blast_df$query_id),])
}
all_filtered_blast_df %>% dplyr::group_by(species) %>% dplyr::summarise(count = n())
all(all_filtered_blast_df[all_filtered_blast_df$species %in% "GCA_022539475.1_USP_GDom_1.0",]$query_id %in% blast_df$query_id)
all_filtered_blast_df$Species <- namecon[match(all_filtered_blast_df$species, namecon$ncbi),]$species
all_filtered_blast_df %>% filter(Species != "Gracilaria_domingensis") %>% distinct(query_id) %>% dplyr::summarise(n = n())
all_filtered_blast_df %>% filter(Species != "Gracilaria_domingensis") %>% dplyr::distinct(Species) %>% dplyr::summarise(n = n())
length(unique(all_filtered_blast_df[!(all_filtered_blast_df$Species %in% "Gracilaria_domingensis"),]$query_id))
length(unique(all_filtered_blast_df[!(all_filtered_blast_df$Species %in% "Gracilaria_domingensis"),]$Species))
#ortho with blast query from GDom
head(ortho_group)
colnames(ortho_group)
ortho_group$GCA_022539475.1_USP_GDom_1.0
blast_dom <- read.table(paste0("./genome/toblast/out_blastp/GCA_022539475.1_USP_GDom_1.0.txt"), sep = "\t", header = FALSE)
colnames(blast_dom) <- c("query_id", "subject_id", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovs")
filtered_dom <- subset(blast_dom, evalue <= evalue_threshold & pident >= pident_threshold & qcovs >= qcovs_threshold) %>%
group_by(query_id) %>%
slice_max(pident, with_ties = FALSE)
dim(filtered_dom)
setdiff(blast_df$query_id, filtered_dom$query_id)
filtered_dom_three <- blast_dom %>% dplyr::filter(query_id %in% setdiff(blast_df$query_id, filtered_dom$query_id)) %>%
group_by(query_id) %>%
slice_max(pident, with_ties = FALSE)
dom_dt <- rbind.data.frame(filtered_dom, filtered_dom_three)
dom_dt <- dom_dt %>% dplyr::mutate(fullname = query_id) %>% tidyr::separate(query_id, sep = "\\.\\.", into = c("gene_oi", "locus_tag"))
ortho_group$GCA_022539475.1_USP_GDom_1.0
grep(paste(dom_dt$subject_id, collapse = "|"), ortho_group$GCA_022539475.1_USP_GDom_1.0)
gene_of_interest <- c()
locus_of_interest <- c()
fullname_of_interest <- c()
for(i in 1:nrow(ortho_group)){
#i = 971
print(i/nrow(ortho_group))
ofin <- ortho_group[i,]$GCA_022539475.1_USP_GDom_1.
ofinc <- unlist(strsplit(ofin, ", "))
sel_dom <- dom_dt[dom_dt$subject_id %in% ofinc,]
g_oi <- paste(unique(sel_dom$gene_oi), collapse = ", ")
lt_oi <- paste(unique(sel_dom$locus_tag), collapse = ", ")
fn_oi <- paste(unique(sel_dom$fullname), collapse = ", ")
if(nrow(sel_dom)>0){
gene_of_interest <- c(gene_of_interest, g_oi)
locus_of_interest <- c(locus_of_interest, lt_oi)
fullname_of_interest <- c(fullname_of_interest, fn_oi)
}else{
gene_of_interest <- c(gene_of_interest, NA)
locus_of_interest <- c(locus_of_interest, NA)
fullname_of_interest <- c(fullname_of_interest, NA)
}
}
updated_ortho_group <- cbind.data.frame(ortho_group, gene_of_interest, locus_of_interest, fullname_of_interest)
poly_ortho_group <- updated_ortho_group[!is.na(updated_ortho_group$gene_of_interest),]
write.xlsx(poly_ortho_group, "./polysaccharide_ortho_group.xlsx")
poly_ortho_group %>% left_join(summary_dt, by = "Orthogroup")
poly_ortho_group_join <- summary_dt %>% right_join(poly_ortho_group, by = "Orthogroup")
poly_ortho_group_join[1:5,1:5]
write.xlsx(poly_ortho_group_join, "./polysaccharide_ortho_group_join.xlsx")