Source code for hepdata.modules.records.utils.doi_minter

# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.
import os

from celery import shared_task
from datacite.errors import DataCiteUnauthorizedError, DataCiteError
from flask import render_template, current_app
from invenio_db import db
from invenio_pidstore.errors import PIDInvalidAction, PIDDoesNotExistError
from invenio_pidstore.models import PersistentIdentifier

from invenio_pidstore.providers.datacite import DataCiteProvider
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm.exc import NoResultFound

from hepdata.modules.submission.models import DataSubmission, HEPSubmission, DataResource, License
from hepdata.modules.records.utils.common import get_record_by_id
import logging

logging.basicConfig()
log = logging.getLogger(__name__)


@shared_task
def generate_doi_for_table(doi):
    """
    Queue DataCite registration for the single table identified by ``doi``.

    Prints a message and returns without queuing anything when either the
    table DOI is not in the database, or no submission with status
    ``finished`` matches the table's INSPIRE ID and version.

    :param doi: DOI value stored on the DataSubmission row of the table.
    :return: None
    """
    site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')

    try:
        table = DataSubmission.query.filter_by(doi=doi).one()
    except NoResultFound:
        print('Table DOI {} not found in database'.format(doi))
        return

    # Only a finished submission of the same version may own the table.
    finished_submission = HEPSubmission.query.filter_by(
        inspire_id=table.publication_inspire_id,
        version=table.version,
        overall_status='finished'
    ).first()

    if not finished_submission:
        print('Finished submission with INSPIRE ID {} and version {} not found in database'.format(
            table.publication_inspire_id, table.version)
        )
        return

    create_data_doi.delay(finished_submission.id, table.id, site_url)
@shared_task
def generate_dois_for_submission(*args, **kwargs):
    """
    Reserve and queue registration of DOIs for every part of the matching submissions.

    For each finished submission selected by ``kwargs`` (optionally limited
    to a record-id range), any missing DOIs are reserved for the submission,
    its tables and its file resources, and then one task per container,
    table and resource is queued to register them.

    :param args: optional pair ``(start_recid, end_recid)``; submissions whose
        ``publication_recid`` lies outside this inclusive range are skipped.
    :param kwargs: filter criteria passed to ``HEPSubmission.query.filter_by``.
    :return: None
    """
    site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')

    submission_query = HEPSubmission.query.filter_by(**kwargs)
    ordered_submissions = submission_query.order_by(HEPSubmission.publication_recid.asc()).all()

    for submission in ordered_submissions:
        if args:
            start_recid, end_recid = args
            recid = submission.publication_recid
            if recid < start_recid or recid > end_recid:
                continue

        if submission.overall_status != 'finished':
            continue

        # Kept as a lazy query (not a list): it is evaluated again each time
        # it is iterated below.
        tables = DataSubmission.query.filter_by(
            publication_inspire_id=submission.inspire_id,
            version=submission.version
        ).order_by(DataSubmission.id.asc())

        files = _get_submission_file_resources(
            submission.publication_recid, submission.version, submission)

        # Reserve whatever DOIs are still missing before queuing registration.
        if submission.doi is None:
            reserve_doi_for_hepsubmission(submission)

        if any(table.doi is None for table in tables):
            reserve_dois_for_data_submissions(data_submissions=tables)

        if any(file_resource.doi is None for file_resource in files):
            reserve_dois_for_resources(publication_recid=submission.publication_recid,
                                       version=submission.version,
                                       resources=files)

        table_ids = [table.id for table in tables]
        file_ids = [file_resource.id for file_resource in files]
        create_container_doi.delay(submission.id, table_ids, file_ids, site_url)

        for table in tables:
            create_data_doi.delay(submission.id, table.id, site_url)

        for file_resource in files:
            create_resource_doi.delay(submission.id, file_resource.id, site_url)
@shared_task(max_retries=6, default_retry_delay=10 * 60)
def create_container_doi(hep_submission_id, data_submission_ids, resource_ids, site_url):
    """
    Build the DataCite XML for a whole submission and register its DOIs.

    Two DOIs are registered from the same template: the base DOI, which
    points at the record page without a version, and the versioned DOI
    (``<base>.v<version>``), which points at the page for this version.

    :param hep_submission_id: id of the HEPSubmission to register.
    :param data_submission_ids: ids of the DataSubmission rows passed to the template.
    :param resource_ids: ids of the DataResource rows belonging to the submission.
    :param site_url: base URL prepended to the record paths.
    :return: None
    """
    container_template = 'hepdata_records/formats/datacite/datacite_container_submission.xml'

    hep_submission = db.session.query(HEPSubmission).get(hep_submission_id)

    data_submissions = db.session.query(DataSubmission).filter(
        DataSubmission.id.in_(data_submission_ids)
    ).all()

    # NOTE(review): the loaded resources are not referenced again in this
    # function; confirm whether the template was meant to receive them.
    resources = db.session.query(DataResource).filter(
        DataResource.id.in_(resource_ids)
    ).all()

    publication_info = get_record_by_id(hep_submission.publication_recid)

    version_doi = hep_submission.doi + ".v{0}".format(hep_submission.version)

    # Everything except the DOI itself is identical for both renderings.
    shared_context = dict(
        overall_submission=hep_submission,
        data_submissions=data_submissions,
        publication_info=publication_info,
        site_url=site_url,
    )
    base_xml = render_template(container_template, doi=hep_submission.doi, **shared_context)
    version_xml = render_template(container_template, doi=version_doi, **shared_context)

    # Register DOI for the version, and update the base DOI to resolve to the latest submission version.
    base_url = site_url + '/record/ins{0}'.format(publication_info['inspire_id'])
    register_doi(hep_submission.doi, base_url, base_xml, publication_info['uuid'])

    version_url = site_url + '/record/ins{0}?version={1}'.format(
        publication_info['inspire_id'], hep_submission.version)
    register_doi(version_doi, version_url, version_xml, publication_info['uuid'])
@shared_task(max_retries=6, default_retry_delay=10 * 60)
def create_data_doi(hep_submission_id, data_submission_id, site_url):
    """
    Build the DataCite XML for one data table and register its DOI.

    :param hep_submission_id: id of the HEPSubmission the table belongs to.
    :param data_submission_id: id of the DataSubmission (table) to register.
    :param site_url: base URL prepended to the table's record path.
    :return: None
    """
    hep_submission = db.session.query(HEPSubmission).get(hep_submission_id)
    data_submission = db.session.query(DataSubmission).get(data_submission_id)

    table_file = DataResource.query.filter_by(id=data_submission.data_file).first()
    publication_info = get_record_by_id(hep_submission.publication_recid)

    # The licence is only looked up when the table has a data file that
    # carries a licence id; otherwise the template receives None.
    table_license = None
    if table_file and table_file.file_license:
        table_license = License.query.filter_by(id=table_file.file_license).first()

    xml = render_template(
        'hepdata_records/formats/datacite/datacite_data_record.xml',
        doi=data_submission.doi,
        table_name=data_submission.name,
        table_description=data_submission.description,
        overall_submission=hep_submission,
        data_submission=data_submission,
        license=table_license,
        publication_info=publication_info,
        site_url=site_url,
    )

    landing_url = site_url + '/record/{0}'.format(data_submission.associated_recid)
    register_doi(data_submission.doi, landing_url, xml, publication_info['uuid'])
@shared_task(max_retries=6, default_retry_delay=10 * 60)
def create_resource_doi(hep_submission_id, resource_id, site_url):
    """
    Build the DataCite XML for one file resource and register its DOI.

    :param hep_submission_id: id of the HEPSubmission the resource belongs to.
    :param resource_id: id of the DataResource to register.
    :param site_url: base URL prepended to the resource landing-page path.
    :return: None
    """
    hep_submission = db.session.query(HEPSubmission).get(hep_submission_id)
    resource = db.session.query(DataResource).get(resource_id)
    publication_info = get_record_by_id(hep_submission.publication_recid)

    # Resolve the licence row only when the resource references one.
    resource_license = (
        License.query.filter_by(id=resource.file_license).first()
        if resource.file_license
        else None
    )

    # Only the file name, not the full storage path, is given to the template.
    filename = os.path.basename(resource.file_location)

    xml = render_template(
        'hepdata_records/formats/datacite/datacite_resource.xml',
        resource=resource,
        doi=resource.doi,
        overall_submission=hep_submission,
        filename=filename,
        license=resource_license,
        publication_info=publication_info,
        site_url=site_url
    )

    landing_url = site_url + '/record/resource/{0}?landing_page=true'.format(resource.id)
    register_doi(resource.doi, landing_url, xml, publication_info['uuid'])
def reserve_doi_for_hepsubmission(hepsubmission, update=False):
    """
    Reserve the base DOI, and optionally the versioned DOI, of a submission.

    The base DOI is ``<DOI_PREFIX>/hepdata.<publication_recid>``. It is
    created and saved on the submission only when the submission has no DOI
    yet. Unless ``update`` is true, the versioned DOI
    ``<base>.v<version>`` is fetched or created as well.

    :param hepsubmission: HEPSubmission object to reserve DOIs for.
    :param update: when true, the versioned DOI is not fetched or created.
    :return: None
    """
    doi_prefix = current_app.config.get('DOI_PREFIX')
    base_doi = "{0}/hepdata.{1}".format(doi_prefix, hepsubmission.publication_recid)

    # A version of 0 is bumped to 1 for the purpose of the DOI suffix.
    version = hepsubmission.version
    if version == 0:
        version += 1

    needs_base_doi = hepsubmission.doi is None
    if needs_base_doi:
        get_or_create_doi(base_doi)
        hepsubmission.doi = base_doi
        db.session.add(hepsubmission)
        db.session.commit()

    if update:
        return

    get_or_create_doi(base_doi + ".v{0}".format(version))
def reserve_dois_for_data_submissions(*args, **kwargs):
    """
    Reserve a DOI for each data table that does not have one yet.

    The tables are taken from ``kwargs['data_submissions']`` when that is
    given and truthy; otherwise they are queried with all of ``kwargs`` as
    filters, ordered by id. Each table's DOI has the form
    ``<DOI_PREFIX>/hepdata.<publication_recid>.v<version>/t<position>``,
    where position is the 1-based place of the table in the iteration.

    :param args: unused.
    :param kwargs: either ``data_submissions`` (iterable of DataSubmission
        objects), or filter criteria that include ``publication_inspire_id``
        or ``publication_recid``.
    :raises KeyError: if none of the recognised keyword arguments is supplied.
    :return: None
    """
    supplied = kwargs.get('data_submissions')
    if supplied:
        data_submissions = supplied
    elif kwargs.get('publication_inspire_id') or kwargs.get('publication_recid'):
        data_submissions = DataSubmission.query.filter_by(**kwargs).order_by(DataSubmission.id.asc())
    else:
        raise KeyError('No inspire_id or data_submissions parameter provided')

    # using the position of the sorted submissions should do a good job of maintaining the order of the tables.
    for position, data_submission in enumerate(data_submissions, start=1):
        # A version of 0 is bumped to 1 for the purpose of the DOI.
        version = data_submission.version
        if version == 0:
            version += 1

        doi_value = "{0}/hepdata.{1}.v{2}/t{3}".format(
            current_app.config.get('DOI_PREFIX'),
            data_submission.publication_recid,
            version,
            position)

        # Tables that already carry a DOI are left untouched.
        if data_submission.doi is not None:
            continue

        get_or_create_doi(doi_value)
        data_submission.doi = doi_value
        db.session.add(data_submission)
        db.session.commit()
def reserve_dois_for_resources(publication_recid, version, resources=None):
    """
    Reserve a DOI for each file resource that does not have one yet.

    Each resource's DOI has the form
    ``<DOI_PREFIX>/hepdata.<publication_recid>.v<version>/r<position>``,
    where position is the 1-based place of the resource in the iteration.

    :param publication_recid: record id of the publication the resources belong to.
    :param version: submission version; a value of 0 is bumped to 1 for the DOI.
    :param resources: list of DataResource objects; when missing or empty,
        the resources are looked up from the submission instead.
    :return: None
    """
    if not resources:
        resources = _get_submission_file_resources(publication_recid, version)

    # using the position of the sorted resources should do a good job of maintaining the order of the tables.
    for position, resource in enumerate(resources, start=1):
        if version == 0:
            version += 1

        doi_value = "{0}/hepdata.{1}.v{2}/r{3}".format(
            current_app.config.get('DOI_PREFIX'),
            publication_recid,
            version,
            position)

        # Resources that already carry a DOI are left untouched.
        if resource.doi is not None:
            continue

        get_or_create_doi(doi_value)
        resource.doi = doi_value
        db.session.add(resource)
        db.session.commit()
def get_or_create_doi(doi):
    """
    Return the DataCite provider for ``doi``, creating the DOI if it does not exist.

    When the ``NO_DOI_MINTING`` config flag is set, nothing is fetched or
    created. Failures while fetching or creating are logged, not raised.

    :param doi: DOI value to fetch or create.
    :return: DataCiteProvider, or None when minting is disabled or an error was logged.
    """
    if current_app.config.get('NO_DOI_MINTING', False):  # pragma: no cover
        log.info(f"Would create DOI {doi}")
        return None

    # Step 1: return the provider straight away if the DOI already exists.
    try:
        return DataCiteProvider.get(doi, 'doi')
    except PIDDoesNotExistError:
        # DOI does not exist, so fall through and create it.
        pass
    except Exception as e:
        log.error(f'Unable to fetch DOI: {str(e)}', exc_info=True)
        return None

    # Step 2: the DOI was not found, so create it.
    try:
        return DataCiteProvider.create(doi)
    except Exception as e:
        log.error(f'Unable to mint DOI: {str(e)}', exc_info=True)
        return None
def register_doi(doi, url, xml, uuid):
    """
    Register a DOI with DataCite, falling back to an update of the existing DOI.

    Nothing is registered when the ``NO_DOI_MINTING`` config flag is set or
    ``doi`` is empty. Otherwise the matching PersistentIdentifier row (if
    any) gets ``uuid`` stored as its ``object_uuid``, and the DOI is
    registered. DataCite failures are retried once and then logged; they are
    not raised to the caller.

    :param doi: DOI value to register.
    :param url: URL the DOI is registered with.
    :param xml: DataCite metadata XML sent with the registration.
    :param uuid: value stored as ``object_uuid`` on the PersistentIdentifier row.
    :return: None
    """
    if current_app.config.get('NO_DOI_MINTING', False) or not doi:  # pragma: no cover
        log.info(f"Would mint DOI {doi}")
        return None

    log.info('{0} - {1}'.format(doi, url))

    print('Minting doi {}'.format(doi))
    provider = get_or_create_doi(doi)

    pidstore_obj = PersistentIdentifier.query.filter_by(pid_value=doi).first()
    if pidstore_obj:
        pidstore_obj.object_uuid = uuid
        db.session.add(pidstore_obj)
        db.session.commit()

    if provider is None:
        # get_or_create_doi returns None when fetching or creating the DOI
        # failed. Without this guard the calls below would raise
        # AttributeError on None instead of reporting the real problem.
        log.error('Unable to mint DOI {0}. No DataCite provider could be obtained.'.format(doi))
        return None

    try:
        provider.register(url, xml)
    except DataCiteUnauthorizedError:
        # Must be handled before the generic DataCiteError branch below.
        log.error('Unable to mint DOI. No authorisation credentials provided.')
    except (PIDInvalidAction, IntegrityError):
        # Registration was rejected locally (presumably because the DOI is
        # already registered), so update it instead.
        try:
            provider.update(url, xml)
        # try again in case of temporary problem
        except DataCiteError:
            try:
                provider.update(url, xml)
            except DataCiteError as dce:
                log.error('Error updating {0} for URL {1}\n\n{2}'.format(doi, url, dce))
    except DataCiteError:
        try:
            provider.register(url, xml)
        # try again in case of temporary problem
        except (PIDInvalidAction, IntegrityError):
            try:
                provider.update(url, xml)
            except DataCiteError as dce:
                log.error('Error updating {0} for URL {1}\n\n{2}'.format(doi, url, dce))
        except DataCiteError as dce:
            log.error('Error registering {0} for URL {1}\n\n{2}'.format(doi, url, dce))
def _get_submission_file_resources(recid, version, submission=None):
    """
    Collect the non-remote resources of a submission, ordered by id.

    A resource is skipped when its ``file_location`` starts with ``http``
    (compared case-insensitively).

    :param recid: publication record id, used only to look up the submission.
    :param version: submission version, used only to look up the submission.
    :param submission: HEPSubmission to read resources from; when None it is
        looked up from ``recid`` and ``version``.
    :return: new list of DataResource objects sorted by ascending id
    """
    if submission is None:
        submission = HEPSubmission.query.filter_by(
            publication_recid=recid, version=version).first()

    local_files = []
    for resource in submission.resources:
        if resource.file_location.lower().startswith('http'):
            continue
        local_files.append(resource)

    return sorted(local_files, key=lambda resource: resource.id)