nenuscanner/archive.py

import builtins
import itertools
import os
import zlib
from datetime import datetime
from os.path import join
from typing import Optional

from flask import Blueprint, Response

from . import db, config

# Chunk size for CRC32 computation (64 KiB)
CRC32_CHUNK_SIZE = 65_536
# Chunk size for streaming file contents (4 MiB)
CHUNK_SIZE = 4_194_304
# ASCII value for space
SPACE = ord(' ')
# ASCII value for zero
ZERO = ord('0')


def tar_header_chunk(filename: str, filepath: str) -> bytes:
    """
    Returns the 512-byte header for a file in a tar archive.

    Args:
        filename (str): path of where the file will be in the archive.
        filepath (str): path of where the file is currently on the disk.
    """
    # Returns the octal representation without the initial '0o' prefix
    def oct(i: int) -> str:
        return builtins.oct(i)[2:]

    # The name field is 100 bytes; a longer name would silently overflow into
    # the following fields
    if len(filename) > 100:
        raise ValueError('filename is too long for a tar header (100 bytes max)')
    stat = os.stat(filepath)
    buffer = bytearray(512)
    # Field 1: filename, on 100 bytes
    buffer[0:len(filename)] = filename.encode('ascii')
    # Field 2: mode, on 8 bytes, octal; the last byte must be \x00, so we set only the first 7 bytes
    buffer[100:107] = oct(stat.st_mode).rjust(7, '0').encode('ascii')
    # Field 3: owner, on 8 bytes, octal; the last byte must be \x00, so we set only the first 7 bytes
    buffer[108:115] = oct(stat.st_uid).rjust(7, '0').encode('ascii')
    # Field 4: group, on 8 bytes, octal; the last byte must be \x00, so we set only the first 7 bytes
    buffer[116:123] = oct(stat.st_gid).rjust(7, '0').encode('ascii')
    # Field 5: file size in bytes, on 12 bytes, octal; the last byte must be \x00, so we set only the first 11 bytes
    buffer[124:135] = oct(stat.st_size).rjust(11, '0').encode('ascii')
    # Field 6: last modified, on 12 bytes, octal; the last byte must be \x00, so we set only the first 11 bytes
    buffer[136:147] = oct(int(stat.st_mtime)).rjust(11, '0').encode('ascii')
    # Field 7: checksum, filled in at the end
    # Field 8: type flag, '0' because we only have regular files
    buffer[156] = ZERO
    # Field 9: linkname, left as \x00s because we only have regular files
    # POSIX 1003.1-1990: the remaining 255 bytes stay empty
    # Compute the checksum: start at 256, which accounts for the 8 checksum
    # bytes being treated as spaces (8 * 32)
    checksum = oct(sum(buffer, 256)).rjust(6, '0').encode('ascii')
    buffer[148:154] = checksum
    # The checksum field must end with b'\x00 ', so we skip the \x00 and write the space
    buffer[155] = SPACE
    return bytes(buffer)
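
# A usage sketch for the header builder; the paths below are illustrative,
# not real files from this project.
#
#     header = tar_header_chunk('object/0/calibration/dark.tiff', '/data/dark.tiff')
#     assert len(header) == 512
#     # header[0:100] holds the archive path, header[124:135] the octal size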


class ArchiveSender:
    """
    Helper class to send archives over the network.

    This class is abstract and needs to be derived by specific archive sender classes.
    """

    def __init__(self):
        """
        Creates a new archive sender.
        """
        self.files: dict[str, str] = {}

    def add_file(self, filename: str, filepath: str):
        """
        Adds a file to the archive.

        Args:
            filename (str): path of where the file will be in the archive.
            filepath (str): path of where the file is currently on the disk.
        """
        self.files[filename] = filepath

    def content_length(self) -> Optional[int]:
        """
        Returns the size of the archive if it is computable beforehand, None otherwise.
        """
        return None

    def generator(self):
        """
        Returns a generator that yields the bytes of the archive.
        """
        raise NotImplementedError("Abstract method")

    def mime_type(self) -> str:
        """
        Returns the MIME type of the archive.
        """
        raise NotImplementedError("Abstract method")

    def archive_name(self) -> str:
        """
        Returns the name of the archive.

        This method is useful for web applications where the archive will be downloaded.
        """
        raise NotImplementedError("Abstract method")

    def response(self) -> Response:
        """
        Returns a Flask response for the archive.
        """
        headers = {'Content-Disposition': f'attachment; filename="{self.archive_name()}"'}
        length = self.content_length()
        if length is not None:
            headers['Content-Length'] = str(length)
        return Response(
            self.generator(),
            mimetype=self.mime_type(),
            headers=headers,
        )


class TarSender(ArchiveSender):
    """
    A sender for tar archives computed on the fly.
    """

    def generator(self):
        def generate():
            for name, file in self.files.items():
                yield tar_header_chunk(name, file)
                bytes_sent = 0
                with open(file, 'rb') as f:
                    while True:
                        chunk = f.read(CHUNK_SIZE)
                        if len(chunk) == 0:
                            break
                        bytes_sent += len(chunk)
                        yield chunk
                # Because tar uses records of 512 bytes, pad the file with
                # zeroes to fill the last record (no padding when the size is
                # already a multiple of 512)
                padding = (-bytes_sent) % 512
                if padding:
                    yield b'\x00' * padding
            # A tar archive ends with two empty 512-byte records
            yield b'\x00' * 1024
        return generate()

    def mime_type(self) -> str:
        return 'application/x-tar'

    def archive_name(self) -> str:
        return 'archive.tar'

    def content_length(self) -> int:
        # Two empty records mark the end of the archive
        length = 1024
        for file in self.files.values():
            stat = os.stat(file)
            # Add the size of the header, plus the size of the content ceiled
            # to a multiple of 512 bytes
            length += 512 + stat.st_size + (-stat.st_size) % 512
        return length
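
# Usage sketch for TarSender inside a Flask view (paths are illustrative):
# the archive is streamed chunk by chunk, never materialized on disk, and
# the Content-Length is known ahead of time.
#
#     sender = TarSender()
#     sender.add_file('data/a.bin', '/srv/files/a.bin')
#     sender.add_file('data/b.bin', '/srv/files/b.bin')
#     return sender.response()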


def crc32(filename: str) -> int:
    """
    Computes the CRC32 checksum of a file.

    Args:
        filename (str): path to the file of which the CRC32 needs to be computed.
    """
    with open(filename, 'rb') as fh:
        checksum = 0
        while True:
            s = fh.read(CRC32_CHUNK_SIZE)
            if not s:
                break
            checksum = zlib.crc32(s, checksum)
        return checksum
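
# The chunked computation above matches a single-shot one, because zlib.crc32
# accepts the running value as its second argument:
#
#     with open(path, 'rb') as f:
#         assert crc32(path) == zlib.crc32(f.read())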


def zip_local_file_header(filename: str, filepath: str, crc: int) -> bytes:
    """
    Generates the bytes for the local file header of the file.

    Args:
        filename (str): path of where the file will be in the archive.
        filepath (str): path of where the file is currently on the disk.
        crc (int):
            the CRC32 checksum of the file. It is not computed by this function because it is also required in the
            central directory file header, so the caller should compute it beforehand and reuse it later to avoid
            computing it twice.
    """
    buffer_size = 30 + len(filename)
    buffer = bytearray(buffer_size)
    stat = os.stat(filepath)
    # Field 1: local file header signature (buffer[0:4])
    buffer[0:4] = b'\x50\x4b\x03\x04'
    # Field 2: version needed to extract (minimum) (buffer[4:6]); two bytes
    # little-endian (assigning a single byte would shrink the bytearray)
    buffer[4:6] = b'\x0a\x00'
    # Field 3: general purpose bit flag (buffer[6:8]), left at 0
    # Field 4: compression method (buffer[8:10]), left at 0 (uncompressed)
    # Field 5: file last modification time and date, MS-DOS format (buffer[10:14])
    mtime = datetime.fromtimestamp(stat.st_mtime)
    buffer[10:12] = ((mtime.second // 2) | (mtime.minute << 5) | (mtime.hour << 11)).to_bytes(2, byteorder='little')
    buffer[12:14] = (mtime.day | (mtime.month << 5) | ((mtime.year - 1980) << 9)).to_bytes(2, byteorder='little')
    # Field 6: CRC32 of the uncompressed data (buffer[14:18])
    buffer[14:18] = crc.to_bytes(4, byteorder='little')
    # Field 7: compressed size (buffer[18:22])
    buffer[18:22] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 8: uncompressed size (buffer[22:26])
    buffer[22:26] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 9: filename length (buffer[26:28])
    buffer[26:28] = len(filename).to_bytes(2, byteorder='little')
    # Field 10: extra field length (buffer[28:30]), left at 0
    # Field 11: filename (buffer[30:30+len(filename)])
    buffer[30:30+len(filename)] = filename.encode('ascii')
    return bytes(buffer)
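
# Worked example of the MS-DOS date/time packing used above: for a file
# modified on 2024-03-15 at 14:30:07,
#
#     time = (7 // 2) | (30 << 5) | (14 << 11) = 3 + 960 + 28672 = 0x73c3
#     date = 15 | (3 << 5) | ((2024 - 1980) << 9) = 15 + 96 + 22528 = 0x586f
#
# stored little-endian as b'\xc3\x73' and b'\x6f\x58'. Note the two-second
# resolution of the seconds field.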


def zip_central_directory_file_header(filename: str, filepath: str, crc: int, offset: int) -> bytes:
    """
    Generates the bytes for the central directory file header of the file.

    Args:
        filename (str): path of where the file will be in the archive.
        filepath (str): path of where the file is currently on the disk.
        crc (int):
            the CRC32 checksum of the file. It is not computed by this function because it is also required in the
            local file header, so the caller should compute it beforehand and reuse it later to avoid computing it
            twice.
        offset (int): offset in bytes of the local file header from the start of the archive.
    """
    buffer_size = 46 + len(filename)
    buffer = bytearray(buffer_size)
    stat = os.stat(filepath)
    # Field 1: central directory file header signature (buffer[0:4])
    buffer[0:4] = b'\x50\x4b\x01\x02'
    # Field 2: version made by (buffer[4:6]); two bytes little-endian
    # (assigning a single byte would shrink the bytearray)
    buffer[4:6] = b'\x0a\x00'
    # Field 3: version needed to extract (minimum) (buffer[6:8])
    buffer[6:8] = b'\x0a\x00'
    # Field 4: general purpose bit flag (buffer[8:10]), left at 0
    # Field 5: compression method (buffer[10:12]), left at 0 (uncompressed)
    # Field 6: file last modification time and date, MS-DOS format (buffer[12:16])
    mtime = datetime.fromtimestamp(stat.st_mtime)
    buffer[12:14] = ((mtime.second // 2) | (mtime.minute << 5) | (mtime.hour << 11)).to_bytes(2, byteorder='little')
    buffer[14:16] = (mtime.day | (mtime.month << 5) | ((mtime.year - 1980) << 9)).to_bytes(2, byteorder='little')
    # Field 7: CRC32 of the uncompressed data (buffer[16:20])
    buffer[16:20] = crc.to_bytes(4, byteorder='little')
    # Field 8: compressed size (buffer[20:24])
    buffer[20:24] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 9: uncompressed size (buffer[24:28])
    buffer[24:28] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 10: filename length (buffer[28:30])
    buffer[28:30] = len(filename).to_bytes(2, byteorder='little')
    # Field 11: extra field length (buffer[30:32]), left at 0
    # Field 12: file comment length (buffer[32:34]), left at 0
    # Field 13: disk number where the file starts (buffer[34:36]), left at 0
    # Field 14: internal file attributes (buffer[36:38]), left at 0
    # Field 15: external file attributes (buffer[38:42]), left at 0
    # Field 16: relative offset of the local file header (buffer[42:46])
    buffer[42:46] = offset.to_bytes(4, byteorder='little')
    # Field 17: filename (buffer[46:46+len(filename)])
    buffer[46:46+len(filename)] = filename.encode('ascii')
    return bytes(buffer)


def zip_end_of_central_directory(items_number: int, central_directory_size: int, central_directory_offset: int) -> bytes:
    """
    Generates the bytes for the end of central directory record of the archive.

    Args:
        items_number (int): number of files in the archive.
        central_directory_size (int): size in bytes of the central directory.
        central_directory_offset (int): offset in bytes where the central directory starts.
    """
    buffer = bytearray(22)
    # Field 1: end of central directory signature = 0x06054b50 (buffer[0:4])
    buffer[0:4] = b'\x50\x4b\x05\x06'
    # Field 2: number of this disk (buffer[4:6]), left at 0
    # Field 3: disk where the central directory starts (buffer[6:8]), left at 0
    # Field 4: number of central directory records on this disk (buffer[8:10])
    buffer[8:10] = items_number.to_bytes(2, byteorder='little')
    # Field 5: total number of central directory records (buffer[10:12])
    buffer[10:12] = items_number.to_bytes(2, byteorder='little')
    # Field 6: size of the central directory in bytes (buffer[12:16])
    buffer[12:16] = central_directory_size.to_bytes(4, byteorder='little')
    # Field 7: offset of the start of the central directory (buffer[16:20])
    buffer[16:20] = central_directory_offset.to_bytes(4, byteorder='little')
    # Field 8: comment length (buffer[20:22]), left at 0
    # Field 9: comment, empty here
    return bytes(buffer)
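
# Overall layout of the archives produced below (stored, no compression, and
# no ZIP64, so all 4-byte size/offset fields cap files and the archive itself
# at 4 GiB):
#
#     [local header 1][file 1 data] ... [local header n][file n data]
#     [central directory header 1] ... [central directory header n]
#     [end of central directory record]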


class ZipSender(ArchiveSender):
    """
    A sender for zip archives computed on the fly.
    """

    def generator(self):
        def generate():
            local_offsets = dict()
            crcs = dict()
            current_byte = 0
            for name, file in self.files.items():
                crcs[name] = crc32(file)
                local_offsets[name] = current_byte
                chunk = zip_local_file_header(name, file, crcs[name])
                current_byte += len(chunk)
                yield chunk
                with open(file, 'rb') as f:
                    while True:
                        chunk = f.read(CHUNK_SIZE)
                        if len(chunk) == 0:
                            break
                        current_byte += len(chunk)
                        yield chunk
            central_directory_size = 0
            central_directory_offset = current_byte
            for name, file in self.files.items():
                chunk = zip_central_directory_file_header(name, file, crcs[name], local_offsets[name])
                central_directory_size += len(chunk)
                current_byte += len(chunk)
                yield chunk
            yield zip_end_of_central_directory(len(self.files), central_directory_size, central_directory_offset)
        return generate()

    def content_length(self) -> int:
        length = 0
        for name, file in self.files.items():
            stat = os.stat(file)
            # Add the sizes of the local file header, the central directory
            # file header, and the file content
            length += 76 + 2 * len(name) + stat.st_size
        # Add the size of the end of central directory record
        return length + 22

    def mime_type(self) -> str:
        return 'application/zip'

    def archive_name(self) -> str:
        return 'archive.zip'
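
# Sanity-check sketch (illustrative path): the announced Content-Length must
# match the generator's output byte for byte, or clients will abort the
# download.
#
#     sender = ZipSender()
#     sender.add_file('data/a.bin', '/srv/files/a.bin')
#     assert sum(len(c) for c in sender.generator()) == sender.content_length()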


def download_object(id: int, archive: ArchiveSender):
    """
    Helper for routes that send archives.
    """
    conn = db.get()
    obj = db.Object.get_from_id(id, conn).full(conn)

    # Group acquisitions sharing a calibration
    def keyfunc(acquisition) -> int:
        return acquisition.calibration_id

    acquisitions_sorted = sorted(obj.acquisitions, key=keyfunc)
    acquisitions_grouped = [
        (db.Calibration.get_from_id(k, conn), list(g))
        for k, g in itertools.groupby(acquisitions_sorted, key=keyfunc)
    ]
    # Fill the archive with the files to send
    for calibration_index, (calib, acquisitions) in enumerate(acquisitions_grouped):
        calibration_dir = join(config.CALIBRATION_DIR, str(calib.id))
        # Add the calibration images
        for image in os.listdir(calibration_dir):
            archive.add_file(
                f'object/{calibration_index}/calibration/{image}',
                join(calibration_dir, image)
            )
        # Add each acquisition
        for acquisition_index, acquisition in enumerate(acquisitions):
            acquisition_dir = join(config.OBJECT_DIR, str(obj.id), str(acquisition.id))
            for image in os.listdir(acquisition_dir):
                archive.add_file(
                    f'object/{calibration_index}/{acquisition_index}/{image}',
                    join(acquisition_dir, image)
                )
    return archive.response()


blueprint = Blueprint('archive', __name__)


@blueprint.route('/download-object/tar/<int:id>')
def download_object_tar(id: int):
    """
    Downloads an object as a tar archive.
    """
    return download_object(id, TarSender())


@blueprint.route('/download-object/zip/<int:id>')
def download_object_zip(id: int):
    """
    Downloads an object as a zip archive.
    """
    return download_object(id, ZipSender())
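
# Registration sketch, assuming the application factory lives elsewhere in
# the package:
#
#     from flask import Flask
#     app = Flask(__name__)
#     app.register_blueprint(blueprint)
#     # GET /download-object/tar/42 then streams object 42 as a tar archive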