5 changes: 4 additions & 1 deletion scraper/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
"jinja2==3.1.4",
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@small_changes",
"requests==2.32.3",
"types-requests==2.32.0.20240914",
"kiwixstorage==0.9.0",
@@ -22,6 +22,9 @@ dependencies = [
"types-beautifulsoup4==4.12.0.20240907",
"lxml==5.3.0",
"tinycss2==1.3.0",
"pif==0.8.2",
"backoff==2.2.1",
"joblib==1.4.2",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

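The three new dependencies each back a piece of this PR: pif provides get_public_ip for the S3 error report in asset.py, backoff powers the retry decorator around asset processing, and joblib presumably supports the new --assets-workers parallelism. For reference, a minimal sketch of the kind of on_backoff handler that asset.py imports; the real mindtouch2zim.utils.backoff_hdlr is not part of this diff, so this is an assumption based on the backoff library's documented handler contract:

# Hypothetical stand-in for mindtouch2zim.utils.backoff_hdlr (not in this diff).
# backoff passes handlers a details dict with keys such as wait, tries, target.
from mindtouch2zim.constants import logger


def backoff_hdlr(details):
    """Log each retry attempted by the backoff decorator."""
    logger.warning(
        "Backing off {wait:0.1f}s after {tries} tries "
        "calling {target}".format(**details)
    )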
247 changes: 247 additions & 0 deletions scraper/src/mindtouch2zim/asset.py
@@ -0,0 +1,247 @@
from io import BytesIO
from typing import NamedTuple

import backoff
from kiwixstorage import KiwixStorage, NotFoundError
from pif import get_public_ip
from PIL import Image
from requests import HTTPError
from requests.exceptions import RequestException
from zimscraperlib.download import stream_file
from zimscraperlib.image.optimization import optimize_webp
from zimscraperlib.image.presets import WebpMedium
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
from zimscraperlib.zim import Creator

from mindtouch2zim.constants import logger, web_session
from mindtouch2zim.utils import add_item_for, backoff_hdlr

SUPPORTED_IMAGE_MIME_TYPES = {
"image/jpeg",
"image/png",
"image/gif",
"image/bmp",
"image/tiff",
"image/webp",
"image/x-portable-pixmap",
"image/x-portable-graymap",
"image/x-portable-bitmap",
"image/x-portable-anymap",
"image/vnd.microsoft.icon",
"image/vnd.ms-dds",
"application/postscript", # for EPS files
}

WEBP_OPTIONS = WebpMedium().options


class HeaderData(NamedTuple):
ident: str # ~version~ of the URL data to use for comparisons
content_type: str | None


class AssetDetails(NamedTuple):
urls: set[HttpUrl]
always_fetch_online: bool


class AssetProcessor:

def __init__(self, s3_url_with_credentials: str | None) -> None:
self.s3_url_with_credentials = s3_url_with_credentials
self._setup_s3()

def process_asset(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)

@backoff.on_exception(
backoff.expo,
RequestException,
max_time=16,
on_backoff=backoff_hdlr,
)
def _process_asset_internal(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
for asset_url in asset_details.urls:
try:
asset_content = self.get_asset_content(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
add_item_for(
creator=creator,
path="content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except HTTPError as exc:
                # This would make more sense as a warning, but that would be far
                # too verbose: on geo.libretexts.org, for instance, many assets
                # are simply missing.
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url

        - get response headers with a streaming GET (retrieving only 1 byte)
        - we do not use HEAD because redirects cannot be followed directly with
          a HEAD request, and the method is not always implemented / might lie
- extract HeaderData from these response headers and return it
"""
_, headers = stream_file(
url=url.value,
byte_stream=BytesIO(),
block_size=1,
only_first_block=True,
)

content_type = headers.get("Content-Type", None)

        # use the first available header as the asset "version" identifier
        for header_name in ("ETag", "Last-Modified", "Content-Length"):
            if header_value := headers.get(header_name):
                return HeaderData(ident=header_value, content_type=content_type)

return HeaderData(ident="-1", content_type=content_type)

def _get_image_content(
self, asset_path: ZimPath, asset_url: HttpUrl, header_data: HeaderData
) -> BytesIO:
"""Get image content for a given url

- download from S3 cache if configured and available
- otherwise:
- download from online
- convert to webp
- optimize webp
- upload to S3 cache if configured
"""
meta = {"ident": header_data.ident, "version": str(WebpMedium.VERSION) + ".r"}
s3_key = f"medium/{asset_path.value}"

if self.s3_url_with_credentials:
if s3_data := self._download_from_s3_cache(s3_key=s3_key, meta=meta):
logger.debug("Fetching directly from S3 cache")
return s3_data # found in cache

logger.debug("Fetching from online")
unoptimized = self._download_from_online(asset_url=asset_url)

logger.debug("Optimizing")
optimized = BytesIO()
with Image.open(unoptimized) as img:
img.save(optimized, format="WEBP")
del unoptimized

optimize_webp(
src=optimized,
quality=WEBP_OPTIONS.get("quality"), # pyright: ignore[reportArgumentType]
method=WEBP_OPTIONS.get("method"), # pyright: ignore[reportArgumentType]
lossless=WEBP_OPTIONS.get(
"lossless"
), # pyright: ignore[reportArgumentType]
)

if self.s3_url_with_credentials:
# upload optimized to S3
logger.debug("Uploading to S3")
self._upload_to_s3_cache(
s3_key=s3_key, meta=meta, asset_content=BytesIO(optimized.getvalue())
)

return optimized

def _download_from_s3_cache(
self, s3_key: str, meta: dict[str, str]
) -> BytesIO | None:
if not self.s3_storage:
raise Exception("s3 storage must be set")
try:
asset_content = BytesIO()
self.s3_storage.download_matching_fileobj( # pyright: ignore[reportUnknownMemberType]
s3_key, asset_content, meta=meta
)
return asset_content
except NotFoundError:
return None
except Exception as exc:
raise Exception(f"Failed to download {s3_key} from S3 cache") from exc

def _upload_to_s3_cache(
self, s3_key: str, meta: dict[str, str], asset_content: BytesIO
):
if not self.s3_storage:
raise Exception("s3 storage must be set")
try:
self.s3_storage.upload_fileobj( # pyright: ignore[reportUnknownMemberType]
key=s3_key, fileobj=asset_content, meta=meta
)
except Exception as exc:
raise Exception(f"Failed to upload {s3_key} to S3 cache") from exc

def _download_from_online(self, asset_url: HttpUrl) -> BytesIO:
"""Download whole content from online server with retry from scraperlib"""

asset_content = BytesIO()
stream_file(
asset_url.value,
byte_stream=asset_content,
session=web_session,
)
return asset_content

def get_asset_content(
self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool
) -> BytesIO:
"""Download of a given asset, optimize if needed, or download from S3 cache"""

if not always_fetch_online:
header_data = self._get_header_data_for(asset_url)
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")

return self._download_from_online(asset_url=asset_url)

    def _setup_s3(self):
        # always define s3_storage so the guards in the cache helpers cannot
        # raise AttributeError when no cache is configured
        self.s3_storage: KiwixStorage | None = None
        if not self.s3_url_with_credentials:
            return
logger.info("testing S3 Optimization Cache credentials")
self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
if not self.s3_storage.check_credentials( # pyright: ignore[reportUnknownMemberType]
list_buckets=True, bucket=True, write=True, read=True, failsafe=True
):
logger.error("S3 cache connection error testing permissions.")
logger.error(
f" Server: {self.s3_storage.url.netloc}" # pyright: ignore[reportUnknownMemberType]
)
logger.error(
f" Bucket: {self.s3_storage.bucket_name}" # pyright: ignore[reportUnknownMemberType]
)
logger.error(
f" Key ID: {self.s3_storage.params.get('keyid')}" # pyright: ignore[reportUnknownMemberType]
)
logger.error(f" Public IP: {get_public_ip()}")
raise Exception("Invalid S3 credentials")
6 changes: 2 additions & 4 deletions scraper/src/mindtouch2zim/constants.py
@@ -2,9 +2,7 @@
import pathlib

from zimscraperlib.download import get_session
from zimscraperlib.logging import (
getLogger,
)
from zimscraperlib.logging import DEFAULT_FORMAT_WITH_THREADS, getLogger

from mindtouch2zim.__about__ import __version__

@@ -18,6 +16,6 @@
HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30

logger = getLogger(NAME, level=logging.DEBUG)
logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)

web_session = get_session()
22 changes: 19 additions & 3 deletions scraper/src/mindtouch2zim/entrypoint.py
@@ -8,7 +8,7 @@
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
RECOMMENDED_MAX_TITLE_LENGTH,
)
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.filesystem import validate_folder_writable

from mindtouch2zim.client import MindtouchClient
from mindtouch2zim.constants import (
@@ -218,17 +218,31 @@ def main(tmpdir: str) -> None:
dest="illustration_url",
)

parser.add_argument(
"--optimization-cache",
help="URL with credentials to S3 for using as optimization cache",
dest="s3_url_with_credentials",
)

parser.add_argument(
"--assets-workers",
type=int,
help=("Number of parallel workers for asset processing (default: 10)"),
default=10,
dest="assets_workers",
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)

output_folder = Path(args.output_folder)
output_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(output_folder, "test.txt")
validate_folder_writable(output_folder)

tmp_folder = Path(args.tmp_folder)
tmp_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(tmp_folder, "test.txt")
validate_folder_writable(tmp_folder)

library_url = str(args.library_url).rstrip("/")

@@ -253,6 +267,8 @@ def main(tmpdir: str) -> None:
stats_file=Path(args.stats_filename) if args.stats_filename else None,
overwrite_existing_zim=args.overwrite,
illustration_url=args.illustration_url,
s3_url_with_credentials=args.s3_url_with_credentials,
assets_workers=args.assets_workers,
).run()
except SystemExit:
logger.error("Generation failed, exiting")
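The diff does not show where assets_workers is consumed, but with joblib newly added to the dependencies, the fan-out plausibly resembles the following sketch; process_all_assets and asset_map are illustrative names, not code from this PR:

# Hypothetical fan-out for the new assets_workers option (not in this diff).
from joblib import Parallel, delayed


def process_all_assets(processor, asset_map, creator, assets_workers=10):
    """Process every collected asset across `assets_workers` parallel threads."""
    Parallel(n_jobs=assets_workers, backend="threading")(
        delayed(processor.process_asset)(
            asset_path=asset_path, asset_details=asset_details, creator=creator
        )
        for asset_path, asset_details in asset_map.items()
    )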