Source code for bioflow.bio_db_parsers.geneOntologyParser
"""
Contains the functions responsible for the parsing of the GO terms
"""
from bioflow.utils.log_behavior import get_logger
log = get_logger(__name__)
[docs]class GOTermsParser(object):
""" Wrapper object for a parser of GO terms."""
def __init__(self):
self.go_terms = {}
# {id:
# {'id':''
# 'name':''
# 'def':''
# 'namespace':''}}
#
# Ignored
# ['subset', => ignore
# 'comment', => ignore
# 'exact_synonym',=> Ignore
# 'consider', => ignore
# 'related_synonym', => ignore
# 'narrow_synonym', => ignore
# 'broad_synonym', => ignore
# 'replaced_by', => ignore
# 'alt_id', => ignore
# 'xref_analog'] => ignore
self.go_terms_structure = []
# [(Node1, relation, Node2)]
# Where relation in:
# 'is_a'
# 'relationship',
# 'part_of'
# 'regulates'
# 'positively_regulates'
# 'negatively_regulates'
self.local_dictionary = {}
self.local_relations = []
self.blocks = 0
self.block = False
self.obsolete = False
[docs] def start_block(self):
""" resets temporary stores to fill so that a new term can be loaded """
self.blocks += 1
self.block = True
self.obsolete = False
self.local_dictionary = {}
self.local_relations = []
[docs] def parse_line_in_block(self, header, payload):
"""
Parses a line within GO term parameters block
:param header: GO term parameter name
:param payload: GO term parameter value
"""
if 'CHEBI:' in payload:
self.local_dictionary['CHEBI'] = payload.split(
'CHEBI:')[1].split(',')[0].split(']')[0]
if header == 'id':
self.local_dictionary[header] = payload.split(':')[1]
if header in ['name', 'namespace', 'def']:
self.local_dictionary[header] = payload
if header == 'is_a':
payload = payload.split('!')[0].split(':')[1].strip()
self.local_relations.append(
(self.local_dictionary['id'], header, payload))
if header == 'is_obsolete':
self.obsolete = True
if header == 'relationship':
header = str(payload.split()[0].strip())
payload = str(payload.split()[1].strip().split(':')[1])
self.local_relations.append(
(self.local_dictionary['id'], header, payload))
[docs] def flush_block(self):
""" flushes all temporary term stores to the main data stores """
self.block = False
if not self.obsolete and self.local_dictionary:
self.go_terms[self.local_dictionary['id']] = self.local_dictionary
self.go_terms_structure = self.go_terms_structure + self.local_relations
[docs] def parse_go_terms(self, source_file_path):
"""
Takes the path to the gene ontology .obo file and returns result of parse dict and list
:param source_file_path: gene ontology .obo file
:return: dict containing term parse, list containing inter-term relationship (turtle)
triplets
"""
with open(source_file_path, "rt") as go_terms_source:
for line in go_terms_source:
if line == '[Term]\n':
self.start_block()
elif self.block and line == '\n':
self.flush_block()
elif self.block:
try:
header = line.split(': ')[0].strip()
payload = line.split(': ')[1].strip()
self.parse_line_in_block(header, payload)
except IndexError:
log.error("Line '%s' violates obo conventions. %s ",
line, " Please check file integrity")
return self.go_terms, self.go_terms_structure