Source code for hepdata.modules.inspire_api.parser

# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Functions for parsing the new INSPIRE JSON metadata."""

from copy import deepcopy

# Fallback value for every field the parser extracts; each getter below
# deep-copies its entry so that callers never share the mutable defaults.
parsed_content_defaults = {
    # bibliographic identity
    "title": None,
    "doi": None,
    "authors": None,
    "type": [],
    # NOTE: the abstract default is the *string* 'None', not the None singleton
    "abstract": "None",
    "creation_date": None,
    "arxiv_id": None,
    # classification
    "collaborations": [],
    "keywords": [],
    "journal_info": "No Journal Information",
    "year": None,
    "subject_area": [],
}



[docs]
def get_title(metadata):
    """Get the title of the publication.

    Order of preference:

    1. the first entry of 'title_translations' whose language is 'en';
    2. the first entry of 'titles' whose source is 'arXiv' or 'submitter';
    3. the first entry of 'titles' that carries a 'title' at all.

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: the selected title, or a copy of ``parsed_content_defaults['title']``
        if none of the above is available.
    """
    # Stop at the first English translation, as documented, and tolerate
    # translation entries that lack a 'language' or 'title' key.
    for translation in metadata.get('title_translations') or []:
        if translation.get('language') == 'en' and 'title' in translation:
            return translation['title']

    titles = metadata.get('titles') or []

    # Prefer the title that came with the arXiv version or from the submitter.
    for entry in titles:
        if 'title' in entry and entry.get('source') in ('arXiv', 'submitter'):
            return entry['title']

    # Otherwise fall back to the first usable title in the list.
    for entry in titles:
        if 'title' in entry:
            return entry['title']

    return deepcopy(parsed_content_defaults['title'])




[docs]
def get_doi(metadata):
    """Return the DOI of the journal publication, i.e. the 'value' of the first entry in 'dois'."""
    fallback = deepcopy(parsed_content_defaults['doi'])
    # Without a non-empty 'dois' list there is nothing to extract.
    if 'dois' not in metadata or len(metadata['dois']) == 0:
        return fallback
    first_doi = metadata['dois'][0]
    return first_doi['value']




[docs]
def get_authors(metadata):
    """Get the authors of the publication as a list of dictionaries with keys 'affiliation' and 'full_name'.

    Only the first affiliation of each author is kept.  An author without any
    usable affiliation (key missing, list empty, or first entry without a
    'value') gets an empty string instead.

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: list of author dictionaries, or a copy of
        ``parsed_content_defaults['authors']`` if the record has no 'authors' key.
    """
    if 'authors' not in metadata:
        return deepcopy(parsed_content_defaults['authors'])

    authors = []
    for author in metadata['authors']:
        # An empty 'affiliations' list must not raise an IndexError.
        affiliations = author.get('affiliations') or []
        affiliation = affiliations[0].get('value', '') if affiliations else ''
        authors.append({'affiliation': affiliation, 'full_name': author['full_name']})
    return authors




[docs]
def get_type(metadata):
    """Return the publication type: the record's 'document_type' value if present, else the default."""
    fallback = deepcopy(parsed_content_defaults['type'])
    # The record's own value is handed back as-is (not copied).
    return metadata['document_type'] if 'document_type' in metadata else fallback




[docs]
def get_abstract(metadata):
    """Get the abstract of the publication, ideally the one from the arXiv version, otherwise the first one.

    An abstract whose source is 'arXiv' or 'submitter' is preferred; failing
    that, the first entry of 'abstracts' that carries a 'value' is used.

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: the selected abstract, or a copy of
        ``parsed_content_defaults['abstract']`` if the record has no usable abstract.
    """
    # `or []` also covers an 'abstracts' key that is present but empty,
    # which would otherwise raise an IndexError on the [0] lookup.
    abstracts = metadata.get('abstracts') or []

    for entry in abstracts:
        if 'value' in entry and entry.get('source') in ('arXiv', 'submitter'):
            return entry['value']

    # No preferred source found: take the first entry that has a value.
    for entry in abstracts:
        if 'value' in entry:
            return entry['value']

    return deepcopy(parsed_content_defaults['abstract'])




[docs]
def get_creation_date(metadata):
    """Return the creation date: the expanded 'preprint_date' if present, else 'legacy_creation_date', else the default."""
    # The preprint date may be partial (e.g. year only), so it is expanded.
    if 'preprint_date' in metadata:
        return expand_date(metadata['preprint_date'])
    # The legacy creation date is passed through unchanged.
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date']
    return deepcopy(parsed_content_defaults['creation_date'])




[docs]
def get_arxiv_id(metadata):
    """Get the arxiv id of the publication from the first value in the list of arxiv eprints.

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: the identifier prefixed with 'arXiv:', or a copy of
        ``parsed_content_defaults['arxiv_id']`` if 'arxiv_eprints' is missing or empty.
    """
    # Guard against an empty list in the same way get_doi does for 'dois'.
    eprints = metadata.get('arxiv_eprints') or []
    if eprints:
        return 'arXiv:' + eprints[0]['value']
    return deepcopy(parsed_content_defaults['arxiv_id'])




[docs]
def get_collaborations(metadata):
    """Return the list of collaboration names (the 'value' of each entry in 'collaborations')."""
    if 'collaborations' not in metadata:
        return deepcopy(parsed_content_defaults['collaborations'])
    return [entry['value'] for entry in metadata['collaborations']]




[docs]
def get_keywords(metadata):
    """Return the record's 'keywords' entry unchanged, or the default when the key is absent."""
    fallback = deepcopy(parsed_content_defaults['keywords'])
    return metadata['keywords'] if 'keywords' in metadata else fallback




[docs]
def get_journal_info(metadata):
    """
    Get the journal information of the publication.

    The preferred format is 'title volume (year) article page_start-page_end',
    built from whichever of these fields the first 'publication_info' entry
    provides (the page range is only added when both ends are present).
    If none of them is available, fall back, in this order, to the
    'pubinfo_freetext' of the first 'publication_info' entry, the first
    'report_numbers' value, or the first 'public_notes' value containing
    'Submitted to ' (with that prefix removed).

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: the journal information string, or a copy of
        ``parsed_content_defaults['journal_info']`` if nothing was found.
    """
    default_journal_info = deepcopy(parsed_content_defaults['journal_info'])

    # Only the first publication_info entry is considered.  An empty list or a
    # non-dict entry is treated as "no publication info" in both stages below.
    publication_info = metadata.get('publication_info') or []
    first_publication = publication_info[0] if publication_info else {}
    if not isinstance(first_publication, dict):
        first_publication = {}

    parts = []
    if 'journal_title' in first_publication:
        parts.append(str(first_publication['journal_title']))
    if 'journal_volume' in first_publication:
        parts.append(str(first_publication['journal_volume']))
    if 'year' in first_publication:
        parts.append('(' + str(first_publication['year']) + ')')
    if 'artid' in first_publication:
        parts.append(str(first_publication['artid']))
    if 'page_start' in first_publication and 'page_end' in first_publication:
        parts.append(str(first_publication['page_start']) + '-' + str(first_publication['page_end']))

    # Strip before testing, so that a whitespace-only result falls through to
    # the fallbacks instead of being returned as an empty string.
    journal_info = ' '.join(parts).strip()
    if journal_info:
        return journal_info

    if 'pubinfo_freetext' in first_publication:
        journal_info = first_publication['pubinfo_freetext']
    elif metadata.get('report_numbers'):
        journal_info = metadata['report_numbers'][0]['value']
    else:
        # First public note announcing a submission, without the prefix.
        journal_info = next(
            (note['value'].replace('Submitted to ', '')
             for note in metadata.get('public_notes') or []
             if 'Submitted to ' in note.get('value', '')),
            '')

    # Drop the '. All figures' remark that can follow the journal name.
    journal_info = journal_info.replace('. All figures', '')

    return journal_info if journal_info != '' else default_journal_info




[docs]
def get_year(metadata):
    """Get the year of the publication.

    Try first 'imprints/date' (only dates that are exactly four characters
    long), then 'publication_info/year' of the first entry, then the leading
    component of 'preprint_date', and finally that of 'legacy_creation_date'.

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: the year as a string, or a copy of
        ``parsed_content_defaults['year']`` if no source is available.
    """
    # First imprint whose date is a bare four-character year.
    imprints = metadata.get('imprints') or []
    imprint_year = next(
        (imprint['date'] for imprint in imprints if len(imprint.get('date', '')) == 4),
        None)
    if imprint_year is not None:
        return imprint_year

    # An empty publication_info list must not raise an IndexError.
    publication_info = metadata.get('publication_info') or []
    if publication_info and 'year' in publication_info[0]:
        return str(publication_info[0]['year'])

    # Both remaining dates are dash-separated with the year first.
    if 'preprint_date' in metadata:
        return metadata['preprint_date'].split("-")[0]
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date'].split("-")[0]

    return deepcopy(parsed_content_defaults['year'])




[docs]
def get_subject_area(metadata):
    """Get the subject areas of the publication as a list without duplicates.

    The list combines the 'categories' of the first 'arxiv_eprints' entry with
    the 'term' of every 'inspire_categories' entry except 'Other'.  Known
    INSPIRE names inside a term are rewritten to the arXiv-style names
    (e.g. 'Experiment-HEP' becomes 'hep-ex').

    :param metadata: dictionary holding the INSPIRE record metadata.
    :return: list of subject areas in order of first occurrence; a copy of
        ``parsed_content_defaults['subject_area']`` if none were found.
    """
    subject_area = deepcopy(parsed_content_defaults['subject_area'])

    # An empty arxiv_eprints list must not raise an IndexError.
    eprints = metadata.get('arxiv_eprints') or []
    if eprints and 'categories' in eprints[0]:
        subject_area += eprints[0]['categories']

    # Applied in this order as plain substring replacements.
    replacements = (
        ('Experiment-HEP', 'hep-ex'),
        ('Experiment-Nucl', 'nucl-ex'),
        ('Theory-Nucl', 'nucl-th'),
        ('Phenomenology-HEP', 'hep-ph'),
        ('Theory-HEP', 'hep-th'),
    )
    for entry in metadata.get('inspire_categories') or []:
        term = entry.get('term')
        if term is None or term == 'Other':
            continue
        for inspire_name, arxiv_name in replacements:
            term = term.replace(inspire_name, arxiv_name)
        subject_area.append(term)

    # Remove duplicates while keeping a reproducible order (a plain set()
    # would make the order of the result vary from run to run).
    return list(dict.fromkeys(subject_area))




[docs]
def updated_parsed_content_for_thesis(content, parsed_content):
    """Add thesis-specific information to ``parsed_content`` and return it.

    ``content['metadata']['thesis_info']`` is stored under
    ``parsed_content['dissertation']`` and normalised in place.  The dictionary
    is not copied, so the changes are also visible through ``content``.

    :param content: record dictionary with a 'metadata' entry containing 'thesis_info'.
    :param parsed_content: dictionary of already parsed fields; updated in place.
    :return: the same ``parsed_content`` object.
    """
    record = content['metadata']
    thesis = record['thesis_info']
    # Same object on purpose: every change below is made through this alias.
    parsed_content['dissertation'] = thesis

    # A single named institution is flattened to dissertation/institution.
    if 'institutions' in thesis:
        institutions = thesis['institutions']
        if len(institutions) == 1 and 'name' in institutions[0]:
            thesis['institution'] = institutions[0]['name']
            del thesis['institutions']

    # The thesis date overrides the year and determines the creation date.
    if 'date' in thesis:
        thesis_date = thesis['date']
        parsed_content['year'] = thesis_date
        if thesis_date is not None:
            if 'legacy_creation_date' in record and record['legacy_creation_date'][:4] == thesis_date:
                # Legacy date starts with the thesis date: keep the more precise value.
                creation_date = record['legacy_creation_date']
            else:
                creation_date = expand_date(thesis_date)
            parsed_content['creation_date'] = creation_date

    # dissertation/degree_type becomes dissertation/type in title case ("PhD" special-cased).
    if 'degree_type' in thesis:
        degree = thesis.pop('degree_type').title()
        thesis['type'] = "PhD" if degree == "Phd" else degree

    # dissertation/date is renamed to dissertation/defense_date.
    if 'date' in thesis:
        thesis['defense_date'] = thesis.pop('date')

    return parsed_content




[docs]
def expand_date(value):
    """
    Pad a partial dash-separated date with '01' components
    until it has year, month and day.
    '2012-08' becomes '2012-08-01' and '2012' becomes '2012-01-01';
    a date that already has three or more components is returned as it is,
    and so is the empty string.
    """
    if value == '':
        return value

    components = value.split('-')
    # A non-positive multiplier yields an empty list, so complete dates get no padding.
    components.extend(['01'] * (3 - len(components)))
    return "-".join(components)
Source code for hepdata.modules.inspire_api.parser

HEPData

Navigation

Related Topics