# Source code for harvdev_utils.char_conversions.sgml_to_unicode

"""Module:: sgml_to_unicode.

Synopsis:
    A module to convert FlyBase SGML entities to Unicode characters (Greek letters, plus < and >).

Author(s):
    Christopher Tabone ctabone@morgan.harvard.edu

"""

import re


# Lookup table from FlyBase SGML entities to their Unicode replacements.
# Built once at import time instead of on every call.
_SGML_SUBSTITUTIONS = {
    '&agr;': '\u03B1',
    '&Agr;': '\u0391',
    '&bgr;': '\u03B2',
    '&Bgr;': '\u0392',
    '&ggr;': '\u03B3',
    '&Ggr;': '\u0393',
    '&dgr;': '\u03B4',
    '&Dgr;': '\u0394',
    '&egr;': '\u03B5',
    '&Egr;': '\u0395',
    '&zgr;': '\u03B6',
    '&Zgr;': '\u0396',
    '&eegr;': '\u03B7',
    '&EEgr;': '\u0397',
    '&thgr;': '\u03B8',
    '&THgr;': '\u0398',
    '&igr;': '\u03B9',
    '&Igr;': '\u0399',
    '&kgr;': '\u03BA',
    '&Kgr;': '\u039A',
    '&lgr;': '\u03BB',
    '&Lgr;': '\u039B',
    '&mgr;': '\u03BC',
    '&Mgr;': '\u039C',
    '&ngr;': '\u03BD',
    '&Ngr;': '\u039D',
    '&xgr;': '\u03BE',
    '&Xgr;': '\u039E',
    '&ogr;': '\u03BF',
    '&Ogr;': '\u039F',
    '&pgr;': '\u03C0',
    '&Pgr;': '\u03A0',
    '&rgr;': '\u03C1',
    '&Rgr;': '\u03A1',
    '&sgr;': '\u03C3',
    '&Sgr;': '\u03A3',
    '&tgr;': '\u03C4',
    '&Tgr;': '\u03A4',
    '&ugr;': '\u03C5',
    '&Ugr;': '\u03A5',
    '&phgr;': '\u03C6',
    '&PHgr;': '\u03A6',
    '&khgr;': '\u03C7',
    '&KHgr;': '\u03A7',
    '&psgr;': '\u03C8',
    '&PSgr;': '\u03A8',
    '&ohgr;': '\u03C9',
    '&OHgr;': '\u03A9',
    '&lt;': '<',
    '&gt;': '>',
}

# Matches anything shaped like an SGML entity: '&', one or more word
# characters, then ';'. Compiled once at import time.
_SGML_PATTERN = re.compile(r'(&\w+;)')


def sgml_to_unicode(input_string: str) -> str:
    r"""Convert FlyBase SGML to Greek characters in unicode.

    e.g. "&agr;" -> "\\u03B1"

    Besides the Greek letters, "&lt;" and "&gt;" are converted to "<" and ">".

    Args:
        input_string (str): The string containing FB SGML characters to be
            converted.

    Returns:
        str: The same string as the input with the SGML characters converted
        to unicode. A string without any SGML-shaped text is returned
        unchanged.

    Raises:
        KeyError: If the regex matches for a set of SGML characters but there
            is no exact matching SGML in the substitution table.

    """
    try:
        # Every entity-shaped match is looked up in the table; an unknown
        # entity makes the lookup raise KeyError, which aborts the whole
        # substitution rather than leaving the entity in place.
        return _SGML_PATTERN.sub(
            lambda match: _SGML_SUBSTITUTIONS[match.group()],
            input_string,
        )
    except KeyError as e:
        print('Regex matched the sgml pattern &\\w+; but no key was found in the substitution dictionary.')
        print('Please check for typos in your sgml: {}'.format(e))
        # Bare raise re-raises the same KeyError with its original traceback.
        raise