nenuscanner/archive.py

import builtins
import itertools
import os
import zlib
from datetime import datetime
from os.path import join
from typing import Optional

from flask import Blueprint, Response

from . import db, config

# Chunk size for CRC32 computation (64 KiB)
CRC32_CHUNK_SIZE = 65_536
# Chunk size for streaming file contents (4 MiB)
CHUNK_SIZE = 4_194_304
# ASCII value for space
SPACE = ord(' ')
# ASCII value for zero
ZERO = ord('0')


def tar_header_chunk(filename: str, filepath: str) -> bytes:
    """
    Returns the 512-byte header for a file in a tar archive.

    Args:
        filename (str): path of where the file will be in the archive.
        filepath (str): path of where the file is currently on the disk.
    """
    # Returns the octal representation without the initial '0o' prefix
    def oct(i: int) -> str:
        return builtins.oct(i)[2:]

    # The name field is 100 bytes; a longer name would silently overflow into
    # the following fields
    if len(filename) > 100:
        raise ValueError('filename is too long for a tar header (100 bytes max)')
    stat = os.stat(filepath)
    buffer = bytearray(512)
    # Field 1: filename, on 100 bytes
    buffer[0:len(filename)] = filename.encode('ascii')
    # Field 2: mode, on 8 bytes, octal; the last byte must be \x00, so we set only the first 7 bytes
    buffer[100:107] = oct(stat.st_mode).rjust(7, '0').encode('ascii')
    # Field 3: owner, on 8 bytes, octal; the last byte must be \x00, so we set only the first 7 bytes
    buffer[108:115] = oct(stat.st_uid).rjust(7, '0').encode('ascii')
    # Field 4: group, on 8 bytes, octal; the last byte must be \x00, so we set only the first 7 bytes
    buffer[116:123] = oct(stat.st_gid).rjust(7, '0').encode('ascii')
    # Field 5: file size in bytes, on 12 bytes, octal; the last byte must be \x00, so we set only the first 11 bytes
    buffer[124:135] = oct(stat.st_size).rjust(11, '0').encode('ascii')
    # Field 6: last modified, on 12 bytes, octal; the last byte must be \x00, so we set only the first 11 bytes
    buffer[136:147] = oct(int(stat.st_mtime)).rjust(11, '0').encode('ascii')
    # Field 7: checksum, filled in at the end
    # Field 8: type flag, '0' because we only have regular files
    buffer[156] = ZERO
    # Field 9: linkname, left as \x00s because we only have regular files
    # POSIX 1003.1-1990: the remaining 255 bytes stay empty
    # Compute the checksum: start at 256, which accounts for the 8 checksum
    # bytes being treated as spaces (8 * 32)
    checksum = oct(sum(buffer, 256)).rjust(6, '0').encode('ascii')
    buffer[148:154] = checksum
    # The checksum field must end with b'\x00 ', so we skip the \x00 and write the space
    buffer[155] = SPACE
    return bytes(buffer)
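
# A usage sketch for the header builder; the paths below are illustrative,
# not real files from this project.
#
#     header = tar_header_chunk('object/0/calibration/dark.tiff', '/data/dark.tiff')
#     assert len(header) == 512
#     # header[0:100] holds the archive path, header[124:135] the octal size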


class ArchiveSender:
    """
    Helper class to send archives over the network.

    This class is abstract and needs to be derived by specific archive sender classes.
    """

    def __init__(self):
        """
        Creates a new archive sender.
        """
        self.files: dict[str, str] = {}

    def add_file(self, filename: str, filepath: str):
        """
        Adds a file to the archive.

        Args:
            filename (str): path of where the file will be in the archive.
            filepath (str): path of where the file is currently on the disk.
        """
        self.files[filename] = filepath

    def content_length(self) -> Optional[int]:
        """
        Returns the size of the archive if it is computable beforehand, None otherwise.
        """
        return None

    def generator(self):
        """
        Returns a generator that yields the bytes of the archive.
        """
        raise NotImplementedError("Abstract method")

    def mime_type(self) -> str:
        """
        Returns the MIME type of the archive.
        """
        raise NotImplementedError("Abstract method")

    def archive_name(self) -> str:
        """
        Returns the name of the archive.

        This method is useful for web applications where the archive will be downloaded.
        """
        raise NotImplementedError("Abstract method")

    def response(self) -> Response:
        """
        Returns a Flask response for the archive.
        """
        headers = {'Content-Disposition': f'attachment; filename="{self.archive_name()}"'}
        length = self.content_length()
        if length is not None:
            headers['Content-Length'] = str(length)
        return Response(
            self.generator(),
            mimetype=self.mime_type(),
            headers=headers,
        )


class TarSender(ArchiveSender):
    """
    A sender for tar archives computed on the fly.
    """

    def generator(self):
        def generate():
            for name, file in self.files.items():
                yield tar_header_chunk(name, file)
                bytes_sent = 0
                with open(file, 'rb') as f:
                    while True:
                        chunk = f.read(CHUNK_SIZE)
                        if len(chunk) == 0:
                            break
                        bytes_sent += len(chunk)
                        yield chunk
                # Because tar uses records of 512 bytes, pad the file with
                # zeroes to fill the last record (no padding when the size is
                # already a multiple of 512)
                padding = (-bytes_sent) % 512
                if padding:
                    yield b'\x00' * padding
            # A tar archive ends with two empty 512-byte records
            yield b'\x00' * 1024
        return generate()

    def mime_type(self) -> str:
        return 'application/x-tar'

    def archive_name(self) -> str:
        return 'archive.tar'

    def content_length(self) -> int:
        # Two empty records mark the end of the archive
        length = 1024
        for file in self.files.values():
            stat = os.stat(file)
            # Add the size of the header, plus the size of the content ceiled
            # to a multiple of 512 bytes
            length += 512 + stat.st_size + (-stat.st_size) % 512
        return length
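
# Usage sketch for TarSender inside a Flask view (paths are illustrative):
# the archive is streamed chunk by chunk, never materialized on disk, and
# the Content-Length is known ahead of time.
#
#     sender = TarSender()
#     sender.add_file('data/a.bin', '/srv/files/a.bin')
#     sender.add_file('data/b.bin', '/srv/files/b.bin')
#     return sender.response()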


def crc32(filename: str) -> int:
    """
    Computes the CRC32 checksum of a file.

    Args:
        filename (str): path to the file of which the CRC32 needs to be computed.
    """
    with open(filename, 'rb') as fh:
        checksum = 0
        while True:
            s = fh.read(CRC32_CHUNK_SIZE)
            if not s:
                break
            checksum = zlib.crc32(s, checksum)
        return checksum
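
# The chunked computation above matches a single-shot one, because zlib.crc32
# accepts the running value as its second argument:
#
#     with open(path, 'rb') as f:
#         assert crc32(path) == zlib.crc32(f.read())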


def zip_local_file_header(filename: str, filepath: str, crc: int) -> bytes:
    """
    Generates the bytes for the local file header of the file.

    Args:
        filename (str): path of where the file will be in the archive.
        filepath (str): path of where the file is currently on the disk.
        crc (int):
            the CRC32 checksum of the file. It is not computed by this function because it is also required in the
            central directory file header, so the caller should compute it beforehand and reuse it later to avoid
            computing it twice.
    """
    buffer_size = 30 + len(filename)
    buffer = bytearray(buffer_size)
    stat = os.stat(filepath)
    # Field 1: local file header signature (buffer[0:4])
    buffer[0:4] = b'\x50\x4b\x03\x04'
    # Field 2: version needed to extract (minimum) (buffer[4:6]); two bytes
    # little-endian (assigning a single byte would shrink the bytearray)
    buffer[4:6] = b'\x0a\x00'
    # Field 3: general purpose bit flag (buffer[6:8]), left at 0
    # Field 4: compression method (buffer[8:10]), left at 0 (uncompressed)
    # Field 5: file last modification time and date, MS-DOS format (buffer[10:14])
    mtime = datetime.fromtimestamp(stat.st_mtime)
    buffer[10:12] = ((mtime.second // 2) | (mtime.minute << 5) | (mtime.hour << 11)).to_bytes(2, byteorder='little')
    buffer[12:14] = (mtime.day | (mtime.month << 5) | ((mtime.year - 1980) << 9)).to_bytes(2, byteorder='little')
    # Field 6: CRC32 of the uncompressed data (buffer[14:18])
    buffer[14:18] = crc.to_bytes(4, byteorder='little')
    # Field 7: compressed size (buffer[18:22])
    buffer[18:22] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 8: uncompressed size (buffer[22:26])
    buffer[22:26] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 9: filename length (buffer[26:28])
    buffer[26:28] = len(filename).to_bytes(2, byteorder='little')
    # Field 10: extra field length (buffer[28:30]), left at 0
    # Field 11: filename (buffer[30:30+len(filename)])
    buffer[30:30+len(filename)] = filename.encode('ascii')
    return bytes(buffer)
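
# Worked example of the MS-DOS date/time packing used above: for a file
# modified on 2024-03-15 at 14:30:07,
#
#     time = (7 // 2) | (30 << 5) | (14 << 11) = 3 + 960 + 28672 = 0x73c3
#     date = 15 | (3 << 5) | ((2024 - 1980) << 9) = 15 + 96 + 22528 = 0x586f
#
# stored little-endian as b'\xc3\x73' and b'\x6f\x58'. Note the two-second
# resolution of the seconds field.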


def zip_central_directory_file_header(filename: str, filepath: str, crc: int, offset: int) -> bytes:
    """
    Generates the bytes for the central directory file header of the file.

    Args:
        filename (str): path of where the file will be in the archive.
        filepath (str): path of where the file is currently on the disk.
        crc (int):
            the CRC32 checksum of the file. It is not computed by this function because it is also required in the
            local file header, so the caller should compute it beforehand and reuse it later to avoid computing it
            twice.
        offset (int): offset in bytes of the local file header from the start of the archive.
    """
    buffer_size = 46 + len(filename)
    buffer = bytearray(buffer_size)
    stat = os.stat(filepath)
    # Field 1: central directory file header signature (buffer[0:4])
    buffer[0:4] = b'\x50\x4b\x01\x02'
    # Field 2: version made by (buffer[4:6]); two bytes little-endian
    # (assigning a single byte would shrink the bytearray)
    buffer[4:6] = b'\x0a\x00'
    # Field 3: version needed to extract (minimum) (buffer[6:8])
    buffer[6:8] = b'\x0a\x00'
    # Field 4: general purpose bit flag (buffer[8:10]), left at 0
    # Field 5: compression method (buffer[10:12]), left at 0 (uncompressed)
    # Field 6: file last modification time and date, MS-DOS format (buffer[12:16])
    mtime = datetime.fromtimestamp(stat.st_mtime)
    buffer[12:14] = ((mtime.second // 2) | (mtime.minute << 5) | (mtime.hour << 11)).to_bytes(2, byteorder='little')
    buffer[14:16] = (mtime.day | (mtime.month << 5) | ((mtime.year - 1980) << 9)).to_bytes(2, byteorder='little')
    # Field 7: CRC32 of the uncompressed data (buffer[16:20])
    buffer[16:20] = crc.to_bytes(4, byteorder='little')
    # Field 8: compressed size (buffer[20:24])
    buffer[20:24] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 9: uncompressed size (buffer[24:28])
    buffer[24:28] = stat.st_size.to_bytes(4, byteorder='little')
    # Field 10: filename length (buffer[28:30])
    buffer[28:30] = len(filename).to_bytes(2, byteorder='little')
    # Field 11: extra field length (buffer[30:32]), left at 0
    # Field 12: file comment length (buffer[32:34]), left at 0
    # Field 13: disk number where the file starts (buffer[34:36]), left at 0
    # Field 14: internal file attributes (buffer[36:38]), left at 0
    # Field 15: external file attributes (buffer[38:42]), left at 0
    # Field 16: relative offset of the local file header (buffer[42:46])
    buffer[42:46] = offset.to_bytes(4, byteorder='little')
    # Field 17: filename (buffer[46:46+len(filename)])
    buffer[46:46+len(filename)] = filename.encode('ascii')
    return bytes(buffer)


def zip_end_of_central_directory(items_number: int, central_directory_size: int, central_directory_offset: int) -> bytes:
    """
    Generates the bytes for the end of central directory record of the archive.

    Args:
        items_number (int): number of files in the archive.
        central_directory_size (int): size in bytes of the central directory.
        central_directory_offset (int): offset in bytes where the central directory starts.
    """
    buffer = bytearray(22)
    # Field 1: end of central directory signature = 0x06054b50 (buffer[0:4])
    buffer[0:4] = b'\x50\x4b\x05\x06'
    # Field 2: number of this disk (buffer[4:6]), left at 0
    # Field 3: disk where the central directory starts (buffer[6:8]), left at 0
    # Field 4: number of central directory records on this disk (buffer[8:10])
    buffer[8:10] = items_number.to_bytes(2, byteorder='little')
    # Field 5: total number of central directory records (buffer[10:12])
    buffer[10:12] = items_number.to_bytes(2, byteorder='little')
    # Field 6: size of the central directory in bytes (buffer[12:16])
    buffer[12:16] = central_directory_size.to_bytes(4, byteorder='little')
    # Field 7: offset of the start of the central directory (buffer[16:20])
    buffer[16:20] = central_directory_offset.to_bytes(4, byteorder='little')
    # Field 8: comment length (buffer[20:22]), left at 0
    # Field 9: comment, empty here
    return bytes(buffer)
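
# Overall layout of the archives produced below (stored, no compression, and
# no ZIP64, so all 4-byte size/offset fields cap files and the archive itself
# at 4 GiB):
#
#     [local header 1][file 1 data] ... [local header n][file n data]
#     [central directory header 1] ... [central directory header n]
#     [end of central directory record]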


class ZipSender(ArchiveSender):
    """
    A sender for zip archives computed on the fly.
    """

    def generator(self):
        def generate():
            local_offsets = dict()
            crcs = dict()
            current_byte = 0
            for name, file in self.files.items():
                crcs[name] = crc32(file)
                local_offsets[name] = current_byte
                chunk = zip_local_file_header(name, file, crcs[name])
                current_byte += len(chunk)
                yield chunk
                with open(file, 'rb') as f:
                    while True:
                        chunk = f.read(CHUNK_SIZE)
                        if len(chunk) == 0:
                            break
                        current_byte += len(chunk)
                        yield chunk
            central_directory_size = 0
            central_directory_offset = current_byte
            for name, file in self.files.items():
                chunk = zip_central_directory_file_header(name, file, crcs[name], local_offsets[name])
                central_directory_size += len(chunk)
                current_byte += len(chunk)
                yield chunk
            yield zip_end_of_central_directory(len(self.files), central_directory_size, central_directory_offset)
        return generate()

    def content_length(self) -> int:
        length = 0
        for name, file in self.files.items():
            stat = os.stat(file)
            # Add the sizes of the local file header, the central directory
            # file header, and the file content
            length += 76 + 2 * len(name) + stat.st_size
        # Add the size of the end of central directory record
        return length + 22

    def mime_type(self) -> str:
        return 'application/zip'

    def archive_name(self) -> str:
        return 'archive.zip'
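
# Sanity-check sketch (illustrative path): the announced Content-Length must
# match the generator's output byte for byte, or clients will abort the
# download.
#
#     sender = ZipSender()
#     sender.add_file('data/a.bin', '/srv/files/a.bin')
#     assert sum(len(c) for c in sender.generator()) == sender.content_length()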


def download_object(id: int, archive: ArchiveSender):
    """
    Helper for routes that send archives.
    """
    conn = db.get()
    obj = db.Object.get_from_id(id, conn).full(conn)

    # Group acquisitions sharing a calibration
    def keyfunc(acquisition) -> int:
        return acquisition.calibration_id

    acquisitions_sorted = sorted(obj.acquisitions, key=keyfunc)
    acquisitions_grouped = [
        (db.Calibration.get_from_id(k, conn), list(g))
        for k, g in itertools.groupby(acquisitions_sorted, key=keyfunc)
    ]
    # Fill the archive with the files to send
    for calibration_index, (calib, acquisitions) in enumerate(acquisitions_grouped):
        calibration_dir = join(config.CALIBRATION_DIR, str(calib.id))
        # Add the calibration images
        for image in os.listdir(calibration_dir):
            archive.add_file(
                f'object/{calibration_index}/calibration/{image}',
                join(calibration_dir, image)
            )
        # Add each acquisition
        for acquisition_index, acquisition in enumerate(acquisitions):
            acquisition_dir = join(config.OBJECT_DIR, str(obj.id), str(acquisition.id))
            for image in os.listdir(acquisition_dir):
                archive.add_file(
                    f'object/{calibration_index}/{acquisition_index}/{image}',
                    join(acquisition_dir, image)
                )
    return archive.response()


blueprint = Blueprint('archive', __name__)


@blueprint.route('/download-object/tar/<int:id>')
def download_object_tar(id: int):
    """
    Downloads an object as a tar archive.
    """
    return download_object(id, TarSender())


@blueprint.route('/download-object/zip/<int:id>')
def download_object_zip(id: int):
    """
    Downloads an object as a zip archive.
    """
    return download_object(id, ZipSender())
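
# Registration sketch, assuming the application factory lives elsewhere in
# the package:
#
#     from flask import Flask
#     app = Flask(__name__)
#     app.register_blueprint(blueprint)
#     # GET /download-object/tar/42 then streams object 42 as a tar archive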