# -*- coding: utf-8 -*-
# Copyright 2018 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import threading

from six import string_types

import boto3
import botocore
from twisted.internet import defer, reactor, threads
from twisted.python.failure import Failure
from twisted.python.threadpool import ThreadPool

from synapse.logging.context import LoggingContext, make_deferred_yieldable
from synapse.rest.media.v1._base import Responder
from synapse.rest.media.v1.storage_provider import StorageProvider

# Synapse 1.13.0 moved current_context to a module-level function.
try:
from synapse.logging.context import current_context
except ImportError:
current_context = LoggingContext.current_context

logger = logging.getLogger("synapse.s3")

# The list of valid AWS storage class names
_VALID_STORAGE_CLASSES = (
"STANDARD",
"REDUCED_REDUNDANCY",
"STANDARD_IA",
"INTELLIGENT_TIERING",
)

# Chunk size in bytes to use when reading from the S3 connection
READ_CHUNK_SIZE = 16 * 1024


class S3StorageProviderBackend(StorageProvider):
    """A storage provider that stores and fetches media in an S3 bucket.

    Args:
        hs (HomeServer)
        config: The config returned by `parse_config`
    """

    def __init__(self, hs, config):
        self.cache_directory = hs.config.media_store_path

        self.bucket = config["bucket"]
self.storage_class = config["storage_class"]
self.api_kwargs = {}
if "region_name" in config:
self.api_kwargs["region_name"] = config["region_name"]
if "endpoint_url" in config:
self.api_kwargs["endpoint_url"] = config["endpoint_url"]
2018-02-07 11:34:53 +00:00
if "access_key_id" in config:
self.api_kwargs["aws_access_key_id"] = config["access_key_id"]
if "secret_access_key" in config:
self.api_kwargs["aws_secret_access_key"] = config["secret_access_key"]
threadpool_size = config.get("threadpool_size", 40)
self._download_pool = ThreadPool(
name="s3-download-pool", maxthreads=threadpool_size
)
        self._download_pool.start()

        # Manually stop the thread pool on shutdown. If we don't do this then
# stopping Synapse takes an extra ~30s as Python waits for the threads
# to exit.
reactor.addSystemEventTrigger(
"during", "shutdown", self._download_pool.stop,
)

    def store_file(self, path, file_info):
        """See StorageProvider.store_file"""

        def _store_file():
            session = boto3.session.Session()

            session.resource("s3", **self.api_kwargs).Bucket(self.bucket).upload_file(
                Filename=os.path.join(self.cache_directory, path),
                Key=path,
                ExtraArgs={"StorageClass": self.storage_class},
            )

        # `reactor.callInThread` returns None, so it cannot be wrapped in
        # `make_deferred_yieldable`; use `deferToThread`, which returns a
        # Deferred that fires once the upload has completed.
        return make_deferred_yieldable(threads.deferToThread(_store_file))

    def fetch(self, path, file_info):
        """See StorageProvider.fetch"""
        logcontext = current_context()

        d = defer.Deferred()
        self._download_pool.callInThread(
            s3_download_task, self.bucket, self.api_kwargs, path, d, logcontext
        )

        return make_deferred_yieldable(d)

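    # A minimal usage sketch (illustrative only; `request` stands in for the
    # IConsumer that Synapse supplies) of how a caller consumes the result of
    # `fetch`:
    #
    #     responder = yield provider.fetch(path, file_info)
    #     if responder:
    #         with responder:
    #             yield responder.write_to_consumer(request)
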
    @staticmethod
    def parse_config(config):
        """Called on startup to parse the config supplied. This should parse
        the config and raise if there is a problem.

        The returned value is passed into the constructor.

        In this case we return a dict with the fields `bucket` and
        `storage_class`, plus the optional API and credential settings.
        """
        bucket = config["bucket"]
        storage_class = config.get("storage_class", "STANDARD")

        assert isinstance(bucket, string_types)
        assert storage_class in _VALID_STORAGE_CLASSES

        result = {
            "bucket": bucket,
            "storage_class": storage_class,
        }

if "region_name" in config:
result["region_name"] = config["region_name"]
if "endpoint_url" in config:
result["endpoint_url"] = config["endpoint_url"]
if "access_key_id" in config:
result["access_key_id"] = config["access_key_id"]
if "secret_access_key" in config:
result["secret_access_key"] = config["secret_access_key"]
return result
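
    # An illustrative homeserver.yaml snippet (bucket name, region and
    # endpoint are example values, not defaults) showing the shape of the
    # config this method parses:
    #
    #     media_storage_providers:
    #       - module: s3_storage_provider.S3StorageProviderBackend
    #         store_local: True
    #         store_remote: True
    #         store_synchronous: True
    #         config:
    #           bucket: my-matrix-media
    #           storage_class: STANDARD_IA
    #           region_name: eu-west-1
    #           endpoint_url: https://s3.example.com
    #           access_key_id: <key id>
    #           secret_access_key: <secret key>
    #           threadpool_size: 40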


def s3_download_task(bucket, api_kwargs, key, deferred, parent_logcontext):
"""Attempts to download a file from S3.
Args:
bucket (str): The S3 bucket which may have the file
2019-03-14 21:54:41 +00:00
api_kwargs (dict): Keyword arguments to pass when invoking the API.
Generally `endpoint_url`.
2018-02-12 13:50:21 +00:00
key (str): The key of the file
2018-02-12 13:50:47 +00:00
deferred (Deferred[_S3Responder|None]): If file exists
2018-02-12 13:50:21 +00:00
resolved with an _S3Responder instance, if it doesn't
exist then resolves with None.
parent_logcontext (LoggingContext): the logcontext to report logs and metrics
against.
2018-02-12 13:50:21 +00:00
"""
    with LoggingContext(parent_context=parent_logcontext):
        logger.info("Fetching %s from S3", key)

        # Note: a `threading.local()` created here would be fresh on every
        # call, so caching the client on it could never hit; just create a
        # new client for each download.
        b3_session = boto3.session.Session()
        s3 = b3_session.client("s3", **api_kwargs)

        try:
            resp = s3.get_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] in ("404", "NoSuchKey",):
                logger.info("Media %s not found in S3", key)
                reactor.callFromThread(deferred.callback, None)
                return

            reactor.callFromThread(deferred.errback, Failure())
            return

        producer = _S3Responder()
        reactor.callFromThread(deferred.callback, producer)
        _stream_to_producer(reactor, producer, resp["Body"], timeout=90.0)


def _stream_to_producer(reactor, producer, body, status=None, timeout=None):
    """Streams a file-like object to the producer.

    Correctly handles the producer being paused/resumed/stopped.

    Args:
        reactor
        producer (_S3Responder): Producer object to stream results to
        body (file like): The object to read from
        status (_ProducerStatus|None): Used to track whether we're currently
            paused or not. Used for testing
        timeout (float|None): Timeout in seconds to wait for the consumer to
            resume after being paused
    """

    # Set when we should be producing, cleared when we are paused
    wakeup_event = producer.wakeup_event

    # Set if we should stop producing forever
    stop_event = producer.stop_event

    if not status:
        status = _ProducerStatus()

    try:
        while not stop_event.is_set():
            # We wait for the producer to signal that the consumer wants
            # more data (or we should abort)
            if not wakeup_event.is_set():
                status.set_paused(True)
                ret = wakeup_event.wait(timeout)
                if not ret:
                    raise Exception("Timed out waiting to resume")
                status.set_paused(False)

            # Check if we were woken up so that we abort the download
            if stop_event.is_set():
                return

            chunk = body.read(READ_CHUNK_SIZE)
            if not chunk:
                return

            reactor.callFromThread(producer._write, chunk)
    except Exception:
        reactor.callFromThread(producer._error, Failure())
    finally:
        reactor.callFromThread(producer._finish)

        if body:
            body.close()


class _S3Responder(Responder):
    """A Responder for S3. Created by s3_download_task."""
    def __init__(self):
        # Triggered by the responder when more data has been requested (or
        # stop_event has been triggered)
        self.wakeup_event = threading.Event()

        # Triggered by the responder when we should abort the download.
        self.stop_event = threading.Event()

# The consumer we're registered to
self.consumer = None
# The deferred returned by write_to_consumer, which should resolve when
# all the data has been written (or there has been a fatal error).
self.deferred = defer.Deferred()

    def write_to_consumer(self, consumer):
        """See Responder.write_to_consumer"""
        self.consumer = consumer

        # We are an IPushProducer, so we start producing immediately until we
        # get a pauseProducing or stopProducing
        consumer.registerProducer(self, True)
        self.wakeup_event.set()

        return make_deferred_yieldable(self.deferred)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_event.set()
        self.wakeup_event.set()

    def resumeProducing(self):
        """See IPushProducer.resumeProducing"""
        # The consumer is asking for more data, signal the download thread
        self.wakeup_event.set()

    def pauseProducing(self):
        """See IPushProducer.pauseProducing"""
        self.wakeup_event.clear()

    def stopProducing(self):
        """See IPushProducer.stopProducing"""
        # The consumer wants no more data ever, signal the download thread
        self.stop_event.set()
        self.wakeup_event.set()

        if not self.deferred.called:
            with LoggingContext():
                self.deferred.errback(Exception("Consumer asked to stop producing"))

    def _write(self, chunk):
        """Writes the chunk of data to the consumer. Called by the download
        thread.
        """
        if self.consumer and not self.stop_event.is_set():
            self.consumer.write(chunk)

    def _error(self, failure):
        """Called when a fatal error occurred while getting data. Called by the
        download thread.
        """
        if self.consumer:
            self.consumer.unregisterProducer()
            self.consumer = None

        if not self.deferred.called:
            self.deferred.errback(failure)

    def _finish(self):
        """Called when there is no more data to write. Called by the download
        thread.
        """
        if self.consumer:
            self.consumer.unregisterProducer()
            self.consumer = None

        if not self.deferred.called:
            self.deferred.callback(None)


class _ProducerStatus(object):
    """Used to track whether the S3 download thread is currently paused
    waiting for the consumer to resume. Used for testing.
    """

    def __init__(self):
        self.is_paused = threading.Event()
        self.is_paused.clear()

def wait_until_paused(self, timeout=None):
is_paused = self.is_paused.wait(timeout)
if not is_paused:
raise Exception("Timed out waiting")

    def set_paused(self, paused):
        if paused:
            self.is_paused.set()
        else:
            self.is_paused.clear()
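

# ---------------------------------------------------------------------------
# A minimal sketch (illustrative; not part of the provider) of how
# `_ProducerStatus` can be used to exercise `_stream_to_producer` in a test.
# `_FakeReactor` and `_example_stream_test` are hypothetical helpers: the fake
# reactor runs callbacks inline, and setting `wakeup_event` stands in for the
# consumer calling resumeProducing().


class _FakeReactor(object):
    """Runs `callFromThread` callbacks inline, for use outside a real reactor."""

    def callFromThread(self, f, *args):
        f(*args)


def _example_stream_test():
    import io

    producer = _S3Responder()
    status = _ProducerStatus()
    body = io.BytesIO(b"x" * READ_CHUNK_SIZE * 2)

    # Stream in a background thread, as s3_download_task would.
    t = threading.Thread(
        target=_stream_to_producer,
        args=(_FakeReactor(), producer, body),
        kwargs={"status": status, "timeout": 5.0},
    )
    t.start()

    # The thread starts paused until the consumer asks for data.
    status.wait_until_paused(5.0)
    producer.wakeup_event.set()  # simulate resumeProducing()
    t.join(5.0)                  # both chunks stream, then _finish fires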