Source code for hepdata.modules.converter.views

#
# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#

"""HEPData Converter Views."""

import fileinput
import logging
import os
import re
import shutil
import tempfile

from flask import Blueprint, send_file, render_template, \
    current_app, redirect, abort
from hepdata.config import CFG_CONVERTER_URL, CFG_SUPPORTED_FORMATS, CFG_CONVERTER_TIMEOUT

from hepdata_converter_ws_client import convert, Error
from hepdata.modules.permissions.api import user_allowed_to_perform_action
from hepdata.modules.converter import convert_zip_archive
from hepdata.modules.submission.api import get_latest_hepsubmission
from hepdata.modules.submission.models import HEPSubmission, DataResource, DataSubmission
from hepdata.utils.file_extractor import extract, get_file_in_directory
from hepdata.modules.records.utils.common import get_record_contents, \
    find_file_in_directory
from hepdata.modules.records.utils.data_files import get_converted_directory_path, \
    find_submission_data_file_path, get_data_path_for_record


from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy import func, or_

from dateutil.parser import parse

# Configure the root logger with default settings (a no-op when handlers are
# already installed) and create the module-level logger used by the views.
logging.basicConfig()
log = logging.getLogger(__name__)

# Every route registered on this blueprint is served under the "/download"
# URL prefix, e.g. the '/submission/...' routes become '/download/submission/...'.
blueprint = Blueprint('converter', __name__,
                      url_prefix="/download",
                      template_folder='templates',
                      static_folder='static')

# Comma-separated format names ('json' followed by the configured supported
# formats). The string is interpolated into the ``any(...)`` URL converter of
# the route declarations in this module.
FORMATS = ','.join(['json'] + CFG_SUPPORTED_FORMATS)


@blueprint.route(f'/submission/<inspire_id>/<any({FORMATS}):file_format>')
@blueprint.route(f'/submission/<inspire_id>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/submission/<inspire_id>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_submission_with_inspire_id(*args, **kwargs):
    """
    Serve a whole submission, looked up by Inspire ID, in the requested format.

    Routes (relative to the blueprint prefix):

    ``/submission/<inspire_id>/<file_format>``

    ``/submission/<inspire_id>/<int:version>/<file_format>``

    ``/submission/<inspire_id>/<int:version>/<file_format>/<rivet>``

    :param inspire_id: Inspire ID; any ``ins`` substring is stripped
    :param version: version of the submission to export. If absent, the latest
        version the current user is allowed to access is used (or 1 if there
        is none).
    :param file_format: json, yaml, csv, root, yoda, yoda1 or original
    :param rivet: Rivet analysis name overriding the default written in the
        YODA export
    :return: the result of ``download_submission``, or an error page when no
        matching submission exists. Aborts with 403 when the requested version
        is the newest one and the user may not access it.
    """
    # str.replace is a no-op when the prefix is absent, so no membership test
    # is needed before stripping it.
    inspire_id = kwargs.pop('inspire_id').replace('ins', '')

    submission = get_latest_hepsubmission(inspire_id=inspire_id)
    if not submission:
        return display_error(
            title="No submission found",
            description="A submission with Inspire ID {0} does not exist".format(inspire_id)
        )

    allowed_versions, all_versions = get_version_count(submission.publication_recid)

    # No explicit version: take the latest allowed version, or 1 when the
    # user is not allowed to see any version.
    version = kwargs.pop('version', allowed_versions or 1)

    if version == all_versions and allowed_versions < all_versions:
        # The newest version exists but is not accessible to this user.
        abort(403)
    elif version < all_versions:
        # An older version was requested: replace the latest submission
        # fetched above with the requested one.
        # NOTE(review): a version greater than all_versions matches neither
        # branch, so the latest submission is served - confirm this is intended.
        submission = HEPSubmission.query.filter_by(inspire_id=inspire_id, version=version).first()

    if not submission:
        return display_error(
            title="No submission found",
            description="A submission with Inspire ID {0} and version {1} does not exist".format(inspire_id, version)
        )

    requested_format = kwargs.pop('file_format')
    rivet_name = kwargs.pop('rivet', '')
    return download_submission(submission, requested_format, rivet_analysis_name=rivet_name)
@blueprint.route(f'/submission/<int:recid>/<any({FORMATS}):file_format>')
@blueprint.route(f'/submission/<int:recid>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/submission/<int:recid>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_submission_with_recid(*args, **kwargs):
    """
    Serve a whole submission, looked up by record ID, in the requested format.

    Routes (relative to the blueprint prefix):

    ``/submission/<int:recid>/<file_format>``

    ``/submission/<int:recid>/<int:version>/<file_format>``

    ``/submission/<int:recid>/<int:version>/<file_format>/<rivet>``

    :param recid: record ID of the publication
    :param version: version of the submission to export. If absent, the latest
        version the current user is allowed to access is used (or 1 if there
        is none).
    :param file_format: json, yaml, csv, root, yoda, yoda1 or original
    :param rivet: Rivet analysis name overriding the default written in the
        YODA export
    :return: the result of ``download_submission``, or an error page when no
        matching submission exists. Aborts with 403 when the requested version
        is the newest one and the user may not access it.
    """
    recid = kwargs.pop('recid')
    allowed_versions, all_versions = get_version_count(recid)

    # No explicit version: take the latest allowed version, or 1 when the
    # user is not allowed to see any version.
    version = kwargs.pop('version', allowed_versions or 1)

    # The newest version exists but is not accessible to this user.
    newest_is_hidden = allowed_versions < all_versions
    if newest_is_hidden and version == all_versions:
        abort(403)

    submission = HEPSubmission.query.filter_by(publication_recid=recid, version=version).first()
    if submission:
        requested_format = kwargs.pop('file_format')
        rivet_name = kwargs.pop('rivet', '')
        return download_submission(submission, requested_format, rivet_analysis_name=rivet_name)

    return display_error(
        title="No submission found",
        description="A submission with record ID {0} and version {1} does not exist".format(recid, version)
    )
def download_submission(submission, file_format, offline=False, force=False, rivet_analysis_name=''):
    """
    Serve a submission in the requested format, converting it when necessary.

    The 'json' format is answered with a redirect to the record page. The
    'original' format is produced by ``create_original_with_resources``. All
    other supported formats are produced by ``convert_zip_archive``.

    :param submission: HEPSubmission
    :param file_format: json, yaml, csv, root, yoda, yoda1 or original
    :param offline: offline creation of the conversion when a record is
        finalised; progress is printed and nothing is sent back
    :param force: force recreation of the conversion even if the output file
        already exists
    :param rivet_analysis_name: Rivet analysis name to override the default
        written in a YODA export
    :return: a redirect, ``display_error`` page or ``send_file`` response when
        online; ``None`` in offline mode (except for an unsupported format,
        which still returns the ``display_error`` page)
    """
    version = submission.version

    file_identifier = submission.publication_recid
    if submission.inspire_id:
        file_identifier = 'ins{0}'.format(submission.inspire_id)

    if file_format == 'json':
        return redirect('/record/{0}?version={1}&format=json'.format(file_identifier, version))

    if file_format not in CFG_SUPPORTED_FORMATS:
        if offline:
            log.error('Format not supported')
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    data_filepath = find_submission_data_file_path(submission)

    if file_format == 'original':
        # Keep the (last) extension of the uploaded file.
        file_format_and_extension = os.path.splitext(data_filepath)[1]
    else:
        file_format_and_extension = '-{0}.tar.gz'.format(file_format)

    output_file = 'HEPData-{0}-v{1}{2}'.format(file_identifier, version, file_format_and_extension)

    converted_dir = get_converted_directory_path(submission.publication_recid)
    # exist_ok makes a separate existence check unnecessary (and race-free).
    os.makedirs(converted_dir, exist_ok=True)

    if file_format.startswith('yoda') and rivet_analysis_name:
        # Don't store in converted_dir since rivet_analysis_name might possibly
        # change between calls. For the same reason a file left behind by an
        # earlier call must never be reused: it may carry a different name.
        output_path = os.path.join(current_app.config['CFG_TMPDIR'], output_file)
        reuse_existing = False
    else:
        output_path = os.path.join(converted_dir, output_file)
        # Reuse an existing file unless recreation is forced or the
        # submission is not finished.
        reuse_existing = not force and submission.overall_status == 'finished'

    if reuse_existing and os.path.exists(output_path):
        if offline:
            print('File already downloaded at {0}'.format(output_path))
            return None
        return send_file(output_path, as_attachment=True)

    if file_format == 'original':
        create_original_with_resources(submission, data_filepath, output_path)
        if offline:
            print('File created at {0}'.format(output_path))
            return None
        return send_file(output_path, as_attachment=True)

    converter_options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'filename': 'HEPData-{0}-v{1}-{2}'.format(file_identifier, version, file_format),
        'validator_schema_version': '0.1.0',
    }

    # Sandbox records get no DOI in the converted output.
    if submission.doi and not submission.overall_status.startswith('sandbox'):
        converter_options['hepdata_doi'] = '{0}.v{1}'.format(submission.doi, version)

    if file_format.startswith('yoda'):
        if not rivet_analysis_name:
            rivet_analysis_name = guess_rivet_analysis_name(submission)
        if rivet_analysis_name:
            converter_options['rivet_analysis_name'] = rivet_analysis_name

    try:
        converted_file = convert_zip_archive(data_filepath, output_path, converter_options)
    except Error as error:  # hepdata_converter_ws_client.Error
        if offline:
            print('File conversion for {0} at {1} failed: {2}'.format(
                file_identifier, output_path, str(error)
            ))
            return None
        return display_error(title='Report concerns to info@hepdata.net', description=str(error))

    if offline:
        print('File for {0} created successfully at {1}'.format(file_identifier, output_path))
        return None
    return send_file(converted_file, as_attachment=True)
@blueprint.route(f'/table/<inspire_id>/<path:table_name>/<any({FORMATS}):file_format>')
@blueprint.route(f'/table/<inspire_id>/<path:table_name>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/table/<inspire_id>/<path:table_name>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_data_table_by_inspire_id(*args, **kwargs):
    """
    Serve a single data table, looked up by Inspire ID and table name.

    Downloads the latest version given a URL such as
    ``/download/table/ins1283842/Table 1/yaml``, or a particular version given
    ``/download/table/ins1283842/Table 1/1/yaml``.

    Routes (relative to the blueprint prefix):

    ``/table/<inspire_id>/<path:table_name>/<file_format>``

    ``/table/<inspire_id>/<path:table_name>/<int:version>/<file_format>``

    ``/table/<inspire_id>/<path:table_name>/<int:version>/<file_format>/<rivet>``

    :param args: unused
    :param kwargs: inspire_id, table_name, version (optional), file_format
        and rivet (optional)
    :return: display_error or download_datatable depending on whether the
        submission and table are found. Aborts with 403 when the requested
        version is the newest one and the user may not access it.
    """
    table_name = kwargs.pop('table_name')
    rivet = kwargs.pop('rivet', '')
    # str.replace is a no-op when the prefix is absent.
    inspire_id = kwargs.pop('inspire_id').replace('ins', '')

    submission = get_latest_hepsubmission(inspire_id=inspire_id)
    if not submission:
        return display_error(
            title="No submission found",
            description="A submission with Inspire ID {0} does not exist".format(inspire_id)
        )

    allowed_versions, all_versions = get_version_count(submission.publication_recid)

    # No explicit version: take the latest allowed version, or 1 when the
    # user is not allowed to see any version.
    version = kwargs.pop('version', allowed_versions or 1)

    if version == all_versions and allowed_versions < all_versions:
        # The newest version exists but is not accessible to this user.
        abort(403)

    # First attempt: the table name exactly as given in the URL.
    try:
        datasubmission = DataSubmission.query.filter_by(
            publication_inspire_id=inspire_id,
            version=version,
            name=table_name
        ).one()
    except NoResultFound:
        datasubmission = None

    # Second attempt: allow spaces in the stored name to be omitted from the URL.
    if datasubmission is None and ' ' not in table_name:
        try:
            datasubmission = DataSubmission.query.filter(
                DataSubmission.publication_inspire_id == inspire_id,
                DataSubmission.version == version,
                func.replace(DataSubmission.name, ' ', '') == table_name
            ).one()
        except NoResultFound:
            datasubmission = None

    if not datasubmission:
        return display_error(
            title="No data submission found",
            description="A data submission with Inspire ID {0}, version {1} and table name '{2}' does not exist"
                .format(inspire_id, version, table_name)
        )

    return download_datatable(
        datasubmission,
        kwargs.pop('file_format'),
        submission_id='ins{0}'.format(inspire_id),
        table_name=datasubmission.name,
        rivet_analysis_name=rivet
    )
@blueprint.route(f'/table/<int:recid>/<path:table_name>/<any({FORMATS}):file_format>')
@blueprint.route(f'/table/<int:recid>/<path:table_name>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/table/<int:recid>/<path:table_name>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_data_table_by_recid(*args, **kwargs):
    """
    Serve a single data table, looked up by record ID and table name.

    Downloads the latest version given a URL such as
    ``/download/table/1231/Table 1/yaml``, or a particular version given
    ``/download/table/1231/Table 1/1/yaml``.

    Routes (relative to the blueprint prefix):

    ``/table/<int:recid>/<path:table_name>/<file_format>``

    ``/table/<int:recid>/<path:table_name>/<int:version>/<file_format>``

    ``/table/<int:recid>/<path:table_name>/<int:version>/<file_format>/<rivet>``

    :param args: unused
    :param kwargs: recid, table_name, version (optional), file_format and
        rivet (optional)
    :return: display_error or download_datatable depending on whether the
        table is found. Aborts with 403 when the requested version is the
        newest one and the user may not access it.
    """
    recid = kwargs.pop('recid')
    table_name = kwargs.pop('table_name')
    rivet = kwargs.pop('rivet', '')

    allowed_versions, all_versions = get_version_count(recid)

    # No explicit version: take the latest allowed version, or 1 when the
    # user is not allowed to see any version.
    version = kwargs.pop('version', allowed_versions or 1)

    # The newest version exists but is not accessible to this user.
    newest_is_hidden = allowed_versions < all_versions
    if newest_is_hidden and version == all_versions:
        abort(403)

    # First attempt: the table name exactly as given in the URL.
    try:
        datasubmission = DataSubmission.query.filter_by(
            publication_recid=recid,
            version=version,
            name=table_name
        ).one()
    except NoResultFound:
        datasubmission = None

    # Second attempt: allow spaces in the stored name to be omitted from the URL.
    if datasubmission is None and ' ' not in table_name:
        try:
            datasubmission = DataSubmission.query.filter(
                DataSubmission.publication_recid == recid,
                DataSubmission.version == version,
                func.replace(DataSubmission.name, ' ', '') == table_name
            ).one()
        except NoResultFound:
            datasubmission = None

    if not datasubmission:
        return display_error(
            title="No data submission found",
            description="A data submission with record ID {0}, version {1} and table name '{2}' does not exist"
                .format(recid, version, table_name)
        )

    return download_datatable(
        datasubmission,
        kwargs.pop('file_format'),
        submission_id='{0}'.format(recid),
        table_name=datasubmission.name,
        rivet_analysis_name=rivet
    )
@blueprint.route(f'/table/<int:data_id>/<any({FORMATS}):file_format>')
def download_datatable_by_dataid(data_id, file_format):
    """
    Download a particular data table, identified by its ID, in a given format.

    :param data_id: ID of the DataSubmission to download
    :param file_format: output format accepted by the route
    :return: download_datatable, or display_error if no data submission with
        this ID exists
    """
    try:
        datasubmission = DataSubmission.query.filter_by(id=data_id).one()
    except NoResultFound:
        # Report an unknown ID with the converter error page, like the other
        # table download views, instead of letting the exception propagate.
        return display_error(
            title="No data submission found",
            description="A data submission with ID {0} does not exist".format(data_id)
        )

    return download_datatable(datasubmission, file_format, submission_id=data_id)
def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Download a particular data table given a ``datasubmission``.

    'json' is answered with a redirect to the record data page. 'yaml' and
    'original' send the stored data file as is. Any other supported format is
    produced by calling the converter service.

    :param datasubmission: DataSubmission whose data file is served
    :param file_format: requested output format
    :param args: unused
    :param kwargs: ``submission_id`` (required, used in the download name),
        ``table_name`` (optional, appended to the download name) and
        ``rivet_analysis_name`` (optional, YODA formats only)
    :return: display_error or send_file depending on success of conversion
    """
    if file_format == 'json':
        json_url = '/record/data/{0}/{1}/{2}'.format(datasubmission.publication_recid,
                                                      datasubmission.id, datasubmission.version)
        return redirect(json_url)

    if file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(id=datasubmission.data_file).one()
    record_path, table_name = os.path.split(dataresource.file_location)

    filename = 'HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'), datasubmission.version)
    if 'table_name' in kwargs:
        # Sanitise the table name before using it in a file name.
        display_name = kwargs.pop('table_name')
        for old, new in ((' ', '_'), ('/', '_'), ('$', ''), ('\\', '')):
            display_name = display_name.replace(old, new)
        filename += '-' + display_name

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)

    if file_format in ('yaml', 'original'):
        # No conversion needed: the stored file is sent unchanged.
        return send_file(
            dataresource.file_location,
            as_attachment=True,
            download_name=filename + '.yaml'
        )

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
        'validator_schema_version': '0.1.0',
    }

    hepsubmission = HEPSubmission.query.filter_by(publication_recid=datasubmission.publication_recid,
                                                  version=datasubmission.version).first()

    # Sandbox records get no DOI in the converted output.
    if datasubmission.doi and not hepsubmission.overall_status.startswith('sandbox'):
        options['hepdata_doi'] = datasubmission.doi.rsplit('/', 1)[0]

    if file_format.startswith('yoda'):
        # An explicitly supplied name wins; otherwise try to guess one.
        rivet_analysis_name = kwargs.pop('rivet_analysis_name', '') or guess_rivet_analysis_name(hepsubmission)
        if rivet_analysis_name:
            options['rivet_analysis_name'] = rivet_analysis_name

    # Path the converter writes its response to.
    converter_output = output_path + '-dir'

    try:
        successful = convert(
            current_app.config.get('CFG_CONVERTER_URL', CFG_CONVERTER_URL),
            record_path,
            output=converter_output,
            options=options,
            extract=False,
            timeout=CFG_CONVERTER_TIMEOUT,
        )
    except Error as error:  # hepdata_converter_ws_client.Error
        return display_error(title='Report concerns to info@hepdata.net', description=str(error))

    if successful:
        extracted_path = extract(converter_output, output_path + "." + file_format)
        os.remove(converter_output)
        # 'yoda1' output is looked up and served with the plain 'yoda' extension.
        extension = file_format[:-1] if file_format == 'yoda1' else file_format
        file_to_send = get_file_in_directory(extracted_path, extension)
    else:
        # Error occurred, the output is a HTML file
        file_to_send = converter_output
        extension = 'html'

    return send_file(file_to_send, as_attachment=True,
                     download_name=filename + '.' + extension)
def display_error(title='Unknown Error', description=''):
    """
    Render the HTML error page describing a conversion error.

    :param title: short message shown as the page's main message
    :param description: detailed message, reported as a single "error"-level
        entry under the "Converter" key
    :return: render_template
    """
    converter_errors = [{
        "level": "error",
        "message": description
    }]

    return render_template(
        'hepdata_records/error_page.html',
        header_message='Converter error encountered',
        message=title,
        errors={"Converter": converter_errors}
    )
def create_original_with_resources(submission, data_filepath, output_path):
    """Copy or create 'original' zip file, i.e. yaml files with resources.

    If the record has a ``resources`` directory (per the original
    documentation: resources imported from hepdata.cedar.ac.uk), a new archive
    is built from the contents of ``data_filepath`` plus those resources, in a
    format that could be re-uploaded as a submission. Otherwise
    ``data_filepath`` is copied unchanged.

    In both cases the result is written to ``output_path``.

    :param submission: HEPSubmission object
    :param data_filepath: Path to original file
    :param output_path: Path to output file (in converted dir)
    :return: None
    """
    resource_location = os.path.join(
        get_data_path_for_record(str(submission.publication_recid)),
        'resources'
    )

    if not os.path.isdir(resource_location):
        # No separate resources directory: the uploaded file is used as is.
        shutil.copy2(data_filepath, output_path)
        return

    # Rewrites '<indent>location: /resource/<...>/<name>' to
    # '<indent>location: <name>' so all resources sit at the top level.
    location_pattern = re.compile(r'(\s+location: )\/resource\/.*\/([^\/]+)')

    # There is a resources directory from when this record was imported from
    # the old hepdata site. We need to create a new archive with the contents
    # of data_filepath and resources.
    with tempfile.TemporaryDirectory(dir=current_app.config['CFG_TMPDIR']) as tmpdir:
        # Copy resources directory into 'contents' dir in temp directory
        contents_path = os.path.join(tmpdir, 'contents')
        shutil.copytree(resource_location, contents_path)

        # Unpack data_filepath into contents path
        shutil.unpack_archive(data_filepath, contents_path)

        # Update the paths in the submission file so that the archive can be
        # re-uploaded or imported.
        submission_found = find_file_in_directory(
            contents_path,
            lambda x: x == "submission.yaml"
        )
        if submission_found:
            # presumably submission_found[1] is the path of the located file;
            # verify against find_file_in_directory.
            # With inplace=True, print() writes into the file being read.
            with fileinput.FileInput(submission_found[1], inplace=True) as file:
                for line in file:
                    print(location_pattern.sub(r'\g<1>\g<2>', line), end='')

        # Archive the contents dir into a new file
        base, ext = os.path.splitext(output_path)
        zip_type = 'zip' if ext == '.zip' else 'gztar'
        print("Creating archive at %s" % output_path)
        archive_path = shutil.make_archive(base, zip_type, contents_path)

        # make_archive appends its own extension ('.zip' or '.tar.gz') to
        # base, so for e.g. a '.gz' or '.tgz' output_path the archive ends up
        # under a different name. Move it to the path the caller expects.
        if os.path.abspath(archive_path) != os.path.abspath(output_path):
            os.replace(archive_path, output_path)
def get_version_count(recid):
    """
    Returns both the number of *allowed* versions and the number of *all* versions.

    All versions are allowed for a Sandbox record and for a user permitted to
    perform actions on the record; otherwise only finished versions are.

    :param recid: record ID of the publication
    :return: version_count, version_count_all
    """
    # Count all versions, finished versions and sandbox versions of the record.
    total = HEPSubmission.query.filter_by(publication_recid=recid).count()
    finished = HEPSubmission.query.filter_by(publication_recid=recid, overall_status='finished').count()
    sandbox = HEPSubmission.query.filter(
        HEPSubmission.publication_recid == recid,
        or_(HEPSubmission.overall_status == 'sandbox',
            HEPSubmission.overall_status == 'sandbox_processing')
    ).count()

    # For a Sandbox record, there is only one version, which is accessible by
    # everyone; the permission check is only consulted for other records.
    if sandbox or user_allowed_to_perform_action(recid):
        return total, total

    return finished, total
def guess_rivet_analysis_name(submission):
    """
    Try to guess the Rivet analysis name.

    If the submission has additional resources of type 'rivet', the name is
    the last path component of the resource location (the last such resource
    wins). Otherwise the name is built as ``<COLLABORATIONS>_<year>_I<inspire_id>``
    from the record contents, where the year is taken from the record's
    creation date or, failing that, from its publication year.

    :param submission: HEPSubmission object
    :return: guessed Rivet analysis name, or an empty string if none could be
        determined
    """
    rivet_analysis_name = ''

    # Check if this submission has a Rivet analysis as additional resources,
    # then extract the Rivet analysis name from the URL.
    for resource in submission.resources:
        if resource.file_type == 'rivet':
            rivet_analysis_name = resource.file_location.split('/')[-1]

    if rivet_analysis_name:
        return rivet_analysis_name

    # Otherwise guess the Rivet analysis name using the collaboration name,
    # the creation year of the INSPIRE record, and the INSPIRE ID.
    record = get_record_contents(submission.publication_recid, submission.overall_status)
    if record and 'inspire_id' in record and record['inspire_id']:
        try:
            year = parse(record['creation_date']).year
        except Exception:
            # Best-effort fallback for a missing or unparsable creation date.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            year = record['year']  # publication year
        rivet_analysis_name = '{0}_{1}_I{2}'.format(
            ''.join(record['collaborations']).upper(), year, record['inspire_id'])

    return rivet_analysis_name