#
# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#
"""HEPData Converter Views."""
import fileinput
import logging
import os
import re
import shutil
import tempfile
from flask import Blueprint, send_file, render_template, \
current_app, redirect, abort
from hepdata.config import CFG_CONVERTER_URL, CFG_SUPPORTED_FORMATS, CFG_CONVERTER_TIMEOUT
from hepdata_converter_ws_client import convert, Error
from hepdata.modules.permissions.api import user_allowed_to_perform_action
from hepdata.modules.converter import convert_zip_archive
from hepdata.modules.submission.api import get_latest_hepsubmission
from hepdata.modules.submission.models import HEPSubmission, DataResource, DataSubmission
from hepdata.utils.file_extractor import extract, get_file_in_directory
from hepdata.modules.records.utils.common import get_record_contents, \
find_file_in_directory
from hepdata.modules.records.utils.data_files import get_converted_directory_path, \
find_submission_data_file_path, get_data_path_for_record
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy import func, or_
from dateutil.parser import parse
# Install a default root handler (a no-op when logging is already configured)
# and create the logger used by the views in this module.
logging.basicConfig()
log = logging.getLogger(__name__)

# Every route registered on this blueprint is served below ``/download``.
blueprint = Blueprint(
    'converter',
    __name__,
    url_prefix="/download",
    template_folder='templates',
    static_folder='static',
)

# Comma-separated format names, interpolated into the ``<any(...):file_format>``
# URL converters of the routes below.
FORMATS = ','.join(['json', *CFG_SUPPORTED_FORMATS])
@blueprint.route(f'/submission/<inspire_id>/<any({FORMATS}):file_format>')
@blueprint.route(f'/submission/<inspire_id>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/submission/<inspire_id>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_submission_with_inspire_id(*args, **kwargs):
    """
    Gets the submission file and either serves it back directly from YAML, or converts it
    for other formats. Routes:\n
    ``/submission/<inspire_id>/<file_format>``\n
    ``/submission/<inspire_id>/<int:version>/<file_format>``\n
    ``/submission/<inspire_id>/<int:version>/<file_format>/<rivet>``

    :param inspire_id: inspire id (any ``ins`` substring is removed before the lookup)
    :param version: version of submission to export. If absent, returns the latest
        version the current user is allowed to access.
    :param file_format: json, yaml, csv, root, yoda, yoda1 or original
    :param rivet: Rivet analysis name to override default written in YODA export
    :return: result of ``download_submission``, or an error page from ``display_error``
        when no matching submission exists. Aborts with 403 when the newest version
        is requested but is not accessible to the current user.
    """
    inspire_id = kwargs.pop('inspire_id')

    if 'ins' in inspire_id:
        inspire_id = inspire_id.replace('ins', '')

    submission = get_latest_hepsubmission(inspire_id=inspire_id)

    if not submission:
        return display_error(
            title="No submission found",
            description="A submission with Inspire ID {0} does not exist".format(inspire_id)
        )

    recid = submission.publication_recid
    version_count, version_count_all = get_version_count(recid)

    if 'version' in kwargs:
        version = kwargs.pop('version')
    else:
        # If version not given explicitly, take to be latest allowed version (or 1 if there are no allowed versions).
        version = version_count if version_count else 1

    if version_count < version_count_all and version == version_count_all:
        # Check for a user trying to access a version of a publication record where they don't have permissions.
        abort(403)
    elif version != version_count_all:
        # Any version other than the newest must be looked up explicitly.
        # This deliberately includes version numbers larger than the newest
        # one: the query then finds nothing and the error page below is
        # shown, instead of silently serving the latest submission under a
        # version number that does not exist.
        submission = HEPSubmission.query.filter_by(inspire_id=inspire_id, version=version).first()

    if not submission:
        return display_error(
            title="No submission found",
            description="A submission with Inspire ID {0} and version {1} does not exist".format(inspire_id, version)
        )

    return download_submission(submission, kwargs.pop('file_format'), rivet_analysis_name=kwargs.pop('rivet', ''))
@blueprint.route(f'/submission/<int:recid>/<any({FORMATS}):file_format>')
@blueprint.route(f'/submission/<int:recid>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/submission/<int:recid>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_submission_with_recid(*args, **kwargs):
    """
    Serve a whole submission, identified by its record ID, in the requested format.
    Routes:\n
    ``/submission/<int:recid>/<file_format>``\n
    ``/submission/<int:recid>/<int:version>/<file_format>``\n
    ``/submission/<int:recid>/<int:version>/<file_format>/<rivet>``\n

    :param recid: publication record ID of the submission
    :param version: version of submission to export. If absent, the latest version
        the current user may access is used (or 1 if there is none).
    :param file_format: json, yaml, csv, root, yoda, yoda1 or original
    :param rivet: Rivet analysis name to override default written in YODA export
    :return: result of ``download_submission``, or an error page when no submission
        matches. Aborts with 403 when the newest version is requested but restricted.
    """
    recid = kwargs.pop('recid')
    allowed_versions, total_versions = get_version_count(recid)

    # An explicit version from the URL wins; otherwise fall back to the latest
    # allowed version, or to 1 when no version is allowed at all.
    version = kwargs.pop('version') if 'version' in kwargs else (allowed_versions or 1)

    # The newest version exists but is hidden from this user: refuse access.
    newest_is_restricted = allowed_versions < total_versions
    if newest_is_restricted and version == total_versions:
        abort(403)

    submission = HEPSubmission.query.filter_by(publication_recid=recid, version=version).first()
    if submission:
        return download_submission(
            submission,
            kwargs.pop('file_format'),
            rivet_analysis_name=kwargs.pop('rivet', ''),
        )

    return display_error(
        title="No submission found",
        description="A submission with record ID {0} and version {1} does not exist".format(recid, version)
    )
def download_submission(submission, file_format, offline=False, force=False, rivet_analysis_name=''):
    """
    Gets the submission file and either serves it back directly from YAML, or converts it
    for other formats.

    :param submission: HEPSubmission
    :param file_format: json, yaml, csv, root, yoda, yoda1 or original
    :param offline: offline creation of the conversion when a record is finalised;
        progress is printed and nothing is sent to a client
    :param force: force recreation of the conversion
    :param rivet_analysis_name: Rivet analysis name to override default written in YODA export
    :return: a redirect (json), ``display_error`` or ``send_file`` depending on success of
        conversion; ``None`` on the offline paths that only print a status message
    """
    version = submission.version

    # Prefer the Inspire ID (prefixed with "ins") over the record ID in file names.
    file_identifier = submission.publication_recid
    if submission.inspire_id:
        file_identifier = 'ins{0}'.format(submission.inspire_id)

    if file_format == 'json':
        return redirect('/record/{0}?version={1}&format=json'.format(file_identifier, version))
    elif file_format not in CFG_SUPPORTED_FORMATS:
        if offline:
            log.error('Format not supported')

        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    data_filepath = find_submission_data_file_path(submission)

    if file_format == 'original':
        # Keep the extension of the uploaded file.
        file_format_and_extension = os.path.splitext(data_filepath)[1]
    else:
        file_format_and_extension = '-{0}.tar.gz'.format(file_format)

    output_file = 'HEPData-{0}-v{1}{2}'.format(file_identifier, submission.version, file_format_and_extension)

    converted_dir = get_converted_directory_path(submission.publication_recid)
    # exist_ok already covers the "directory is there" case, so no separate
    # existence check (which would only add a race window) is needed.
    os.makedirs(converted_dir, exist_ok=True)

    if file_format.startswith('yoda') and rivet_analysis_name:
        # Don't store in converted_dir since rivet_analysis_name might possibly change between calls.
        # For the same reason a file left at this temporary location by an
        # earlier call is never reused: it may carry a different analysis name.
        output_path = os.path.join(current_app.config['CFG_TMPDIR'], output_file)
    else:
        output_path = os.path.join(converted_dir, output_file)

        # If the file is already available in the dir, send it back
        # unless we are forcing recreation of the file or the submission is not finished.
        if os.path.exists(output_path) and not force and submission.overall_status == 'finished':
            if not offline:
                return send_file(output_path, as_attachment=True)
            print('File already downloaded at {0}'.format(output_path))
            return None

    if file_format == 'original':
        create_original_with_resources(submission, data_filepath, output_path)
        if not offline:
            return send_file(output_path, as_attachment=True)
        print('File created at {0}'.format(output_path))
        return None

    converter_options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'filename': 'HEPData-{0}-v{1}-{2}'.format(file_identifier, submission.version, file_format),
        'validator_schema_version': '0.1.0',
    }

    # Sandbox records do not get a DOI written into the converted output.
    if submission.doi and not submission.overall_status.startswith('sandbox'):
        converter_options['hepdata_doi'] = '{0}.v{1}'.format(submission.doi, version)

    if file_format.startswith('yoda'):
        if not rivet_analysis_name:
            rivet_analysis_name = guess_rivet_analysis_name(submission)
        if rivet_analysis_name:
            converter_options['rivet_analysis_name'] = rivet_analysis_name

    try:
        converted_file = convert_zip_archive(data_filepath, output_path, converter_options)
        if not offline:
            return send_file(converted_file, as_attachment=True)
        else:
            print('File for {0} created successfully at {1}'.format(file_identifier, output_path))
    except Error as error:  # hepdata_converter_ws_client.Error
        if not offline:
            return display_error(title='Report concerns to info@hepdata.net', description=str(error))
        else:
            print('File conversion for {0} at {1} failed: {2}'.format(
                file_identifier, output_path, str(error)
            ))
@blueprint.route(f'/table/<inspire_id>/<path:table_name>/<any({FORMATS}):file_format>')
@blueprint.route(f'/table/<inspire_id>/<path:table_name>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/table/<inspire_id>/<path:table_name>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_data_table_by_inspire_id(*args, **kwargs):
    """
    Inspire ID download of a single table.
    Downloads the latest data file given the url ``/download/table/ins1283842/Table 1/yaml`` or
    a particular version given ``/download/table/ins1283842/Table 1/1/yaml``. Routes:\n
    ``/table/<inspire_id>/<path:table_name>/<file_format>``\n
    ``/table/<inspire_id>/<path:table_name>/<int:version>/<file_format>``\n
    ``/table/<inspire_id>/<path:table_name>/<int:version>/<file_format>/<rivet>``\n

    :param args: unused
    :param kwargs: inspire_id, table_name, version (optional), file_format and rivet (optional)
    :return: display_error or download_datatable depending on whether the table was found.
        Aborts with 403 when the newest version is requested but restricted.
    """
    inspire_id = kwargs.pop('inspire_id')
    table_name = kwargs.pop('table_name')
    rivet = kwargs.pop('rivet', '')

    # Accept both "ins1283842" and "1283842"; replace() is a no-op without "ins".
    inspire_id = inspire_id.replace('ins', '')

    submission = get_latest_hepsubmission(inspire_id=inspire_id)
    if not submission:
        return display_error(
            title="No submission found",
            description="A submission with Inspire ID {0} does not exist".format(inspire_id)
        )

    allowed_versions, total_versions = get_version_count(submission.publication_recid)

    # An explicit version from the URL wins; otherwise fall back to the latest
    # allowed version, or to 1 when no version is allowed at all.
    version = kwargs.pop('version') if 'version' in kwargs else (allowed_versions or 1)

    # The newest version exists but is hidden from this user: refuse access.
    if allowed_versions < total_versions and version == total_versions:
        abort(403)

    try:
        datasubmission = DataSubmission.query.filter_by(
            publication_inspire_id=inspire_id,
            version=version,
            name=table_name,
        ).one()
    except NoResultFound:
        datasubmission = None
        if ' ' not in table_name:
            # Second chance: the URL may have left out the spaces of the
            # table name, so compare against the stored names without spaces.
            try:
                datasubmission = DataSubmission.query.filter(
                    DataSubmission.publication_inspire_id == inspire_id,
                    DataSubmission.version == version,
                    func.replace(DataSubmission.name, ' ', '') == table_name
                ).one()
            except NoResultFound:
                datasubmission = None

    if not datasubmission:
        return display_error(
            title="No data submission found",
            description="A data submission with Inspire ID {0}, version {1} and table name '{2}' does not exist"
                .format(inspire_id, version, table_name)
        )

    return download_datatable(
        datasubmission,
        kwargs.pop('file_format'),
        submission_id='ins{0}'.format(inspire_id),
        table_name=datasubmission.name,
        rivet_analysis_name=rivet,
    )
@blueprint.route(f'/table/<int:recid>/<path:table_name>/<any({FORMATS}):file_format>')
@blueprint.route(f'/table/<int:recid>/<path:table_name>/<int:version>/<any({FORMATS}):file_format>')
@blueprint.route('/table/<int:recid>/<path:table_name>/<int:version>/<any(yoda,yoda1):file_format>/<rivet>')
def download_data_table_by_recid(*args, **kwargs):
    """
    Record ID download of a single table.
    Downloads the latest data file given the url ``/download/table/1231/Table 1/yaml`` or
    a particular version given ``/download/table/1231/Table 1/1/yaml``. Routes:\n
    ``/table/<int:recid>/<path:table_name>/<file_format>``\n
    ``/table/<int:recid>/<path:table_name>/<int:version>/<file_format>``\n
    ``/table/<int:recid>/<path:table_name>/<int:version>/<file_format>/<rivet>``\n

    :param args: unused
    :param kwargs: recid, table_name, version (optional), file_format and rivet (optional)
    :return: display_error or download_datatable depending on whether the table was found.
        Aborts with 403 when the newest version is requested but restricted.
    """
    recid = kwargs.pop('recid')
    table_name = kwargs.pop('table_name')
    rivet = kwargs.pop('rivet', '')

    allowed_versions, total_versions = get_version_count(recid)

    # An explicit version from the URL wins; otherwise fall back to the latest
    # allowed version, or to 1 when no version is allowed at all.
    version = kwargs.pop('version') if 'version' in kwargs else (allowed_versions or 1)

    # The newest version exists but is hidden from this user: refuse access.
    if allowed_versions < total_versions and version == total_versions:
        abort(403)

    try:
        datasubmission = DataSubmission.query.filter_by(
            publication_recid=recid,
            version=version,
            name=table_name,
        ).one()
    except NoResultFound:
        datasubmission = None
        if ' ' not in table_name:
            # Second chance: the URL may have left out the spaces of the
            # table name, so compare against the stored names without spaces.
            try:
                datasubmission = DataSubmission.query.filter(
                    DataSubmission.publication_recid == recid,
                    DataSubmission.version == version,
                    func.replace(DataSubmission.name, ' ', '') == table_name
                ).one()
            except NoResultFound:
                datasubmission = None

    if not datasubmission:
        return display_error(
            title="No data submission found",
            description="A data submission with record ID {0}, version {1} and table name '{2}' does not exist"
                .format(recid, version, table_name)
        )

    return download_datatable(
        datasubmission,
        kwargs.pop('file_format'),
        submission_id='{0}'.format(recid),
        table_name=datasubmission.name,
        rivet_analysis_name=rivet,
    )
@blueprint.route(f'/table/<int:data_id>/<any({FORMATS}):file_format>')
def download_datatable_by_dataid(data_id, file_format):
    """
    Download a particular data table in a given format.

    :param data_id: primary key of the DataSubmission to download
    :param file_format: one of the formats accepted by the route (see ``FORMATS``)
    :return: download_datatable, or display_error if no data submission has this id
    """
    try:
        datasubmission = DataSubmission.query.filter_by(id=data_id).one()
    except NoResultFound:
        # An unknown id is a user error, not a server error: answer with the
        # same error page the other table views use instead of letting the
        # exception escape the view.
        return display_error(
            title="No data submission found",
            description="A data submission with ID {0} does not exist".format(data_id)
        )

    return download_datatable(datasubmission, file_format, submission_id=data_id)
def download_datatable(datasubmission, file_format, *args, **kwargs):
    """
    Send a single data table, given its ``datasubmission``, in the requested format.

    :param datasubmission: DataSubmission whose data file is exported
    :param file_format: json, or one of ``CFG_SUPPORTED_FORMATS``
    :param args: unused
    :param kwargs: ``submission_id`` (required, used in the download name),
        ``table_name`` (optional, appended to the download name) and
        ``rivet_analysis_name`` (optional, only read for YODA formats)
    :return: a redirect (json), display_error or send_file depending on success of conversion
    """
    if file_format == 'json':
        json_url = '/record/data/{0}/{1}/{2}'.format(
            datasubmission.publication_recid, datasubmission.id, datasubmission.version)
        return redirect(json_url)

    if file_format not in CFG_SUPPORTED_FORMATS:
        return display_error(
            title="The " + file_format + " output format is not supported",
            description="This output format is not supported. " +
                        "Currently supported formats: " + str(CFG_SUPPORTED_FORMATS),
        )

    dataresource = DataResource.query.filter_by(id=datasubmission.data_file).one()
    # Directory holding the record's files, and the file name of this table.
    record_path, table_name = os.path.split(dataresource.file_location)

    filename = 'HEPData-{0}-v{1}'.format(kwargs.pop('submission_id'), datasubmission.version)
    if 'table_name' in kwargs:
        # Turn the human-readable table name into something safe for a file name.
        safe_name = kwargs.pop('table_name')
        for old, new in ((' ', '_'), ('/', '_'), ('$', ''), ('\\', '')):
            safe_name = safe_name.replace(old, new)
        filename = filename + '-' + safe_name

    output_path = os.path.join(current_app.config['CFG_TMPDIR'], filename)

    # YAML is the stored format, so the file can be sent without conversion.
    if file_format in ('yaml', 'original'):
        return send_file(
            dataresource.file_location,
            as_attachment=True,
            download_name=filename + '.yaml'
        )

    options = {
        'input_format': 'yaml',
        'output_format': file_format,
        'table': table_name,
        'filename': table_name.split('.')[0],
        'validator_schema_version': '0.1.0',
    }

    hepsubmission = HEPSubmission.query.filter_by(
        publication_recid=datasubmission.publication_recid,
        version=datasubmission.version,
    ).first()

    # Sandbox records do not get a DOI written into the converted output.
    if datasubmission.doi and not hepsubmission.overall_status.startswith('sandbox'):
        options['hepdata_doi'] = datasubmission.doi.rsplit('/', 1)[0]

    if file_format.startswith('yoda'):
        rivet_analysis_name = kwargs.pop('rivet_analysis_name', '')
        if not rivet_analysis_name:
            rivet_analysis_name = guess_rivet_analysis_name(hepsubmission)
        if rivet_analysis_name:
            options['rivet_analysis_name'] = rivet_analysis_name

    # Location the converter writes its raw response to.
    raw_output = output_path + '-dir'

    try:
        successful = convert(
            current_app.config.get('CFG_CONVERTER_URL', CFG_CONVERTER_URL),
            record_path,
            output=raw_output,
            options=options,
            extract=False,
            timeout=CFG_CONVERTER_TIMEOUT,
        )
    except Error as error:  # hepdata_converter_ws_client.Error
        return display_error(title='Report concerns to info@hepdata.net', description=str(error))

    if not successful:
        # Error occurred, the output is a HTML file
        return send_file(raw_output, as_attachment=True,
                         download_name=filename + '.' + 'html')

    extracted_path = extract(raw_output, output_path + "." + file_format)
    os.remove(raw_output)

    # Both YODA flavours produce files with the plain "yoda" extension.
    extension = 'yoda' if file_format == 'yoda1' else file_format
    file_to_send = get_file_in_directory(extracted_path, extension)

    return send_file(file_to_send, as_attachment=True,
                     download_name=filename + '.' + extension)
def display_error(title='Unknown Error', description=''):
    """
    Render the HTML error page describing a converter problem.

    :param title: short message shown as the main message of the page
    :param description: detailed text, reported as a single "error"-level
        entry under the "Converter" heading
    :return: render_template
    """
    converter_errors = [{"level": "error", "message": description}]

    return render_template(
        'hepdata_records/error_page.html',
        header_message='Converter error encountered',
        message=title,
        errors={"Converter": converter_errors},
    )
def create_original_with_resources(submission, data_filepath, output_path):
    """Copy or create 'original' zip file, i.e. yaml files with resources. If
    resources were imported from hepdata.cedar.ac.uk we create a new zip
    in a format that could be re-uploaded as a submission.

    :param type submission: HEPSubmission object
    :param type data_filepath: Path to original file
    :param type output_path: Path to output file (in converted dir). The
        result is always written to exactly this path.
    :return: None
    """
    resource_location = os.path.join(
        get_data_path_for_record(str(submission.publication_recid)),
        'resources'
    )

    if not os.path.isdir(resource_location):
        # No separate resources directory: the original upload is served as is.
        shutil.copy2(data_filepath, output_path)
        return

    # There is a resources directory from when this record was imported
    # from the old hepdata site. We need to create a new zip with the
    # contents of data_filepath and resources
    with tempfile.TemporaryDirectory(dir=current_app.config['CFG_TMPDIR']) as tmpdir:
        # Copy resources directory into 'contents' dir in temp directory
        contents_path = os.path.join(tmpdir, 'contents')
        shutil.copytree(resource_location, contents_path)

        # Unzip data_filepath into contents path
        shutil.unpack_archive(data_filepath, contents_path)

        # Need to go through the submission file and update the paths so
        # that all resources are at the top level. This should allow the
        # zip to be re-uploaded or imported
        submission_found = find_file_in_directory(
            contents_path,
            lambda x: x == "submission.yaml"
        )
        if submission_found:
            p = re.compile(r'(\s+location: )\/resource\/.*\/([^\/]+)')
            # With inplace=True, print() inside this block writes to the
            # file being rewritten, not to the console.
            with fileinput.FileInput(submission_found[1], inplace=True) as file:
                for line in file:
                    print(p.sub(r'\g<1>\g<2>', line), end='')

        # Zip up contents dir into a new file
        base, ext = os.path.splitext(output_path)
        zip_type = 'zip' if ext == '.zip' else 'gztar'
        print("Creating archive at %s" % output_path)
        archive_path = shutil.make_archive(base, zip_type, contents_path)

        # make_archive appends its own suffix to ``base`` (".tar.gz" for
        # gztar), which only equals output_path when that ends in ".zip" or
        # ".tar.gz" with nothing stripped. Callers send the file found at
        # output_path, so put the archive there if it ended up elsewhere.
        if os.path.abspath(archive_path) != os.path.abspath(output_path):
            shutil.move(archive_path, output_path)
def get_version_count(recid):
    """
    Count the versions of a publication record: those the current user may
    access (*allowed*) and those that exist (*all*).

    :param recid: publication record ID
    :return: version_count, version_count_all
    """
    total = HEPSubmission.query.filter_by(publication_recid=recid).count()
    finished = HEPSubmission.query.filter_by(publication_recid=recid, overall_status='finished').count()
    sandbox = HEPSubmission.query.filter(
        HEPSubmission.publication_recid == recid,
        or_(HEPSubmission.overall_status == 'sandbox', HEPSubmission.overall_status == 'sandbox_processing')
    ).count()

    # A Sandbox record has a single version that everyone may access; for any
    # other record, all versions are visible only to users with permission on
    # it (the permission check is skipped for Sandbox records).
    if sandbox or user_allowed_to_perform_action(recid):
        return total, total

    # Everybody else only sees the finished versions.
    return finished, total
def guess_rivet_analysis_name(submission):
    """
    Try to guess the Rivet analysis name.

    The name is taken from the last path component of a ``rivet`` resource
    attached to the submission (the last such resource wins). Failing that, it
    is built as ``<COLLABORATIONS>_<year>_I<inspire_id>`` from the record
    contents, provided the record has an Inspire ID.

    :param submission: HEPSubmission object
    :return: guessed Rivet analysis name, or an empty string if none could be guessed
    """
    rivet_analysis_name = ''

    # Check if this submission has a Rivet analysis as additional resources,
    # then extract the Rivet analysis name from the URL.
    for resource in submission.resources:
        if resource.file_type == 'rivet':
            rivet_analysis_name = resource.file_location.split('/')[-1]

    if not rivet_analysis_name:
        # Otherwise guess the Rivet analysis name using the collaboration name,
        # the creation year of the INSPIRE record, and the INSPIRE ID.
        record = get_record_contents(submission.publication_recid,
                                     submission.overall_status)
        if record and 'inspire_id' in record and record['inspire_id']:
            try:
                year = parse(record['creation_date']).year
            except Exception:
                # Missing or unparsable creation date: fall back to the
                # publication year. Deliberately not a bare ``except`` so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                year = record['year']  # publication year
            rivet_analysis_name = '{0}_{1}_I{2}'.format(''.join(
                record['collaborations']).upper(), year, record['inspire_id'])

    return rivet_analysis_name