# api/files.py
import os
import cgi
import json
import shutil
import hashlib
import collections
from backports import tempfile
from . import util
from . import config
DEFAULT_HASH_ALG='sha384'
def move_file(path, target_path):
    """
    Move the file at `path` to `target_path`, creating parent directories as needed.

    `target_path` may be a bare filename (no directory component), in which
    case no directory is created and the file is moved relative to the
    current working directory.

    Raises OSError if the target directory cannot be created; errors from
    shutil.move() propagate unchanged.
    """
    target_dir = os.path.dirname(target_path)

    # dirname() is '' for a bare filename; os.makedirs('') would raise.
    if target_dir:
        try:
            os.makedirs(target_dir)
        except OSError:
            # An already-existing directory (including one created concurrently
            # by another worker) is fine; anything else is a real failure.
            if not os.path.isdir(target_dir):
                raise

    shutil.move(path, target_path)
def move_form_file_field_into_cas(file_field):
    """
    Relocate an uploaded form file from its temporary location into the CAS.

    The destination is the hash-derived path (util.path_from_hash) under the
    configured persistent data path.

    The field must be an augmented file field exposing truthy `hash` and
    `path` attributes; see upload.process_upload() for details.
    Raises Exception when either attribute is missing a value.
    """
    if not (file_field.hash and file_field.path):
        raise Exception("Field is not a file field with hash and path")

    data_root = config.get_item('persistent', 'data_path')
    relative_cas_path = util.path_from_hash(file_field.hash)
    destination = os.path.join(data_root, relative_cas_path)
    move_file(file_field.path, destination)
def hash_file_formatted(path, hash_alg=None, buffer_size=65536):
    """
    Hash the file at `path` and return the digest in scitran hash format.

    `hash_alg` is a hashlib algorithm name; DEFAULT_HASH_ALG is used when it
    is not given. The file is read in chunks of `buffer_size` bytes so it is
    never loaded into memory in one piece.
    """
    algorithm = hash_alg if hash_alg else DEFAULT_HASH_ALG
    digest = hashlib.new(algorithm)

    with open(path, 'rb') as stream:
        # read() returns an empty string at EOF, which ends the iteration.
        for chunk in iter(lambda: stream.read(buffer_size), b''):
            digest.update(chunk)

    return util.format_hash(algorithm, digest.hexdigest())
class HashingFile(file):
    """
    Write-only binary file that feeds every written chunk into a hash.

    Subclasses the Python 2 builtin `file` type. `hash_alg` holds the
    hashlib object and `hash_name` the algorithm name it was created with.
    """

    def __init__(self, file_path, hash_alg):
        # Open (and truncate) the target first, then set up the hasher.
        file.__init__(self, file_path, "wb")
        self.hash_alg = hashlib.new(hash_alg)
        self.hash_name = hash_alg

    def write(self, data):
        """Update the running hash with `data`, then write it to disk."""
        self.hash_alg.update(data)
        return super(HashingFile, self).write(data)

    def get_hash(self):
        """Return the hex digest of everything written so far."""
        return self.hash_alg.hexdigest()

    def get_formatted_hash(self):
        """Return the current digest formatted by util.format_hash()."""
        digest = self.get_hash()
        return util.format_hash(self.hash_name, digest)
# Lightweight record with two fields: `info` and `path`.
ParsedFile = collections.namedtuple('ParsedFile', ('info', 'path'))
def process_form(request, hash_alg=None):
    """
    Some workarounds to make webapp2 process forms in an intelligent way,
    and hash files we process.

    Normally webapp2/WebOb Request.POST would copy the entire request stream
    into a single file on disk.
    https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L787
    https://github.com/moraes/webapp-improved/pull/12

    We pass request.body_file (wrapped wsgi input stream)
    to our custom subclass of cgi.FieldStorage to write each upload file
    to a separate file on disk, as it comes in off the network stream from the client.
    Then we can rename these files to their final destination,
    without copying the data again.

    Args:
        request: the incoming request; its `environ` and `body_file` are used.
        hash_alg: hashlib algorithm name used to hash uploaded files.
            Defaults to DEFAULT_HASH_ALG when not given.

    Returns (tuple):
        form: HashingFieldStorage instance
        tempdir: tempdir the file was stored in.

    Keep tempdir in scope until you don't need it anymore; it will be deleted on GC.
    """
    hash_alg = hash_alg or DEFAULT_HASH_ALG

    # Store form file fields in a tempdir
    tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path'))

    # Copied from WebOb source:
    # https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L790
    env = request.environ.copy()
    env.setdefault('CONTENT_LENGTH', '0')
    env['QUERY_STRING'] = ''

    # Use the resolved algorithm, not the module default, so that a
    # caller-supplied hash_alg is actually honoured.
    field_storage_class = getHashingFieldStorage(
        tempdir.name, hash_alg
    )

    form = field_storage_class(
        fp=request.body_file, environ=env, keep_blank_values=True
    )

    return (form, tempdir)
def getHashingFieldStorage(upload_dir, hash_alg):
    """
    Build and return a cgi.FieldStorage subclass bound to `upload_dir` and `hash_alg`.

    Fields that carry a filename are written into `upload_dir` through a
    HashingFile created with `hash_alg`, so each upload is hashed while it
    is being written to disk.
    """
    # pylint: disable=attribute-defined-outside-init
    # We dynamically create this class because we
    # can't add arguments to __init__.
    # This is due to the FieldStorage we create
    # in turn creating a FieldStorage for different
    # parts of the form, with a hardcoded set of args
    # https://github.com/python/cpython/blob/1e3e162ff5c0cc656559c43914439ab3e5734f00/Lib/cgi.py#L696
    # https://github.com/python/cpython/blob/1e3e162ff5c0cc656559c43914439ab3e5734f00/Lib/cgi.py#L728
    class HashingFieldStorage(cgi.FieldStorage):
        # Class-level override of FieldStorage.bufsize (1 MiB).
        bufsize = 2**20

        def make_file(self, binary=None):
            # Opens the on-disk destination for this field and remembers it as
            # self.open_file. `binary` is accepted but not used here.
            # Sanitize form's filename (read: prevent malicious escapes, bad characters, etc)
            self.filename = os.path.basename(self.filename)
            # self.filename = util.sanitize_string_to_filename(self.filename)
            self.open_file = HashingFile(os.path.join(upload_dir, self.filename), hash_alg)
            return self.open_file

        # override private method __write of superclass FieldStorage
        # _FieldStorage__file is the private variable __file of the same class
        # NOTE(review): these name-mangled attributes are internals of the cgi
        # module; confirm they still exist before changing Python versions.
        def _FieldStorage__write(self, line):
            # pylint: disable=access-member-before-definition
            if self._FieldStorage__file is not None:
                # Always write fields of type "file" to disk for consistent renaming behavior
                if self.filename:
                    # Switch to the on-disk file and replay what was buffered so far.
                    self.file = self.make_file('')
                    self.file.write(self._FieldStorage__file.getvalue())
                # Drop the buffer reference so this branch is not taken again.
                self._FieldStorage__file = None
            self.file.write(line)

        def get_hash(self):
            # Hex digest of the file opened by make_file(); self.open_file is
            # only assigned there, so this fails if make_file() never ran.
            return self.open_file.get_hash()

    return HashingFieldStorage
# File extension --> scitran file type detection heuristics.
# Listed in precedence order.
with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd:
    TYPE_MAP = json.load(fd)

# Inverted lookup table: extension -> file type.
# dict.items() is used instead of iteritems() so this works on both
# Python 2 and Python 3; the resulting mapping is the same.
KNOWN_FILETYPES = {ext: filetype for filetype, extensions in TYPE_MAP.items() for ext in extensions}
def guess_type_from_filename(filename):
    """
    Return the file type registered in KNOWN_FILETYPES for `filename`, or None.

    Candidate extensions are tried from longest to shortest, so for
    'scan.nii.gz' the lookup tries '.nii.gz' before '.gz'. Matching is
    case-insensitive on the extension (it is lowercased before lookup).
    """
    pieces = filename.split('.')[1:]
    for start in range(len(pieces)):
        candidate = '.' + '.'.join(pieces[start:])
        match = KNOWN_FILETYPES.get(candidate.lower())
        if match:
            return match
    return None