From 05f4948677f566e8a0c60334d46461b94dc25940 Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Tue, 30 Sep 2025 08:55:22 +0300 Subject: [PATCH 1/8] main --- src/rcdtool/main.py | 46 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/src/rcdtool/main.py b/src/rcdtool/main.py index 5a80cec..4727fa9 100644 --- a/src/rcdtool/main.py +++ b/src/rcdtool/main.py @@ -29,6 +29,7 @@ import asyncio from dataclasses import dataclass from typing import Optional, Coroutine, cast +from urllib.parse import urlparse import argparse from rcdtool.rcdtool import RCD @@ -155,7 +156,7 @@ def main(): rcd_tool = RCD(args.config_filename, dry_mode=args.dry_mode) - raw_targets: list[tuple[int|str, int]] = [] + raw_targets: list[tuple[int|str, int, Optional[int]]] = [] if args.link is None: channel_id = args.channel_id or input('Channel ID: ') @@ -168,7 +169,7 @@ def main(): for (start, end) in range_message_id: logger.debug('range(%s, %s)', start, end) for current_message_id in range(start, end + 1): - raw_targets.append((channel_id, current_message_id)) + raw_targets.append((channel_id, current_message_id, utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None)) logger.debug('target: %s', raw_targets) else: links: list[str] = [] @@ -181,15 +182,42 @@ def main(): for link in links: logger.debug('current link: %s', link) - channel_id, message_id = link.split('/')[-2:] - logger.debug('current message_id options: %s', message_id) - message_id_list: list[str] = [] - range_message_id = utils.parse_ranges(message_id) + channel_id_segment: Optional[str] = None + message_segment: Optional[str] = None + discussion_segment: Optional[str] = None + + try: + parsed = urlparse(link) + path = parsed.path or '' + parts = [p for p in path.split('/') if p] + if 'c' in parts: + c_index = parts.index('c') + tail = parts[c_index + 1:] + if len(tail) >= 1: + channel_id_segment = tail[0] + if len(tail) >= 2: 
+ message_segment = tail[1] + if len(tail) >= 3: + discussion_segment = tail[2] + # Fallback to old behavior if parsing failed + if channel_id_segment is None or message_segment is None: + channel_id_segment, message_segment = link.split('/')[-2:] + except Exception: # pragma: no cover - safe fallback + channel_id_segment, message_segment = link.split('/')[-2:] + + logger.debug('current message_id options: %s', message_segment) + range_message_id = utils.parse_ranges(message_segment) for (start, end) in range_message_id: logger.debug('range(%s, %s)', start, end) for current_message_id in range(start, end + 1): - raw_targets.append((channel_id, current_message_id)) + raw_targets.append(( + channel_id_segment, + current_message_id, + utils.parse_message_id(discussion_segment) if discussion_segment is not None else ( + utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None + ) + )) logger.debug('target: %s', raw_targets) output_filename: Optional[str] = args.output_filename @@ -197,7 +225,7 @@ def main(): coros: list[Coroutine[None, None, Optional[str]]] = [] exclude_names: list[str] = [] - for channel_id, message_id in raw_targets: + for channel_id, message_id, target_discussion_message_id in raw_targets: updated_channel_id = utils.parse_channel_id(channel_id) updated_message_id = utils.parse_message_id(message_id) logger.debug('downloading from %s:%s', channel_id,message_id) @@ -219,7 +247,7 @@ def main(): message_id=updated_message_id, output_filename=final_output_filename, infer_extension=args.infer_extension, - discussion_message_id=utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None, + discussion_message_id=target_discussion_message_id, ) coros.append(coro) From ac2292abd4f072c9848ee355bd5aadf581ac2d3e Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Tue, 30 Sep 2025 08:55:30 +0300 Subject: [PATCH 2/8] Revert "main" This reverts commit 
05f4948677f566e8a0c60334d46461b94dc25940. --- src/rcdtool/main.py | 46 +++++++++------------------------------------ 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/src/rcdtool/main.py b/src/rcdtool/main.py index 4727fa9..5a80cec 100644 --- a/src/rcdtool/main.py +++ b/src/rcdtool/main.py @@ -29,7 +29,6 @@ import asyncio from dataclasses import dataclass from typing import Optional, Coroutine, cast -from urllib.parse import urlparse import argparse from rcdtool.rcdtool import RCD @@ -156,7 +155,7 @@ def main(): rcd_tool = RCD(args.config_filename, dry_mode=args.dry_mode) - raw_targets: list[tuple[int|str, int, Optional[int]]] = [] + raw_targets: list[tuple[int|str, int]] = [] if args.link is None: channel_id = args.channel_id or input('Channel ID: ') @@ -169,7 +168,7 @@ def main(): for (start, end) in range_message_id: logger.debug('range(%s, %s)', start, end) for current_message_id in range(start, end + 1): - raw_targets.append((channel_id, current_message_id, utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None)) + raw_targets.append((channel_id, current_message_id)) logger.debug('target: %s', raw_targets) else: links: list[str] = [] @@ -182,42 +181,15 @@ def main(): for link in links: logger.debug('current link: %s', link) - channel_id_segment: Optional[str] = None - message_segment: Optional[str] = None - discussion_segment: Optional[str] = None - - try: - parsed = urlparse(link) - path = parsed.path or '' - parts = [p for p in path.split('/') if p] - if 'c' in parts: - c_index = parts.index('c') - tail = parts[c_index + 1:] - if len(tail) >= 1: - channel_id_segment = tail[0] - if len(tail) >= 2: - message_segment = tail[1] - if len(tail) >= 3: - discussion_segment = tail[2] - # Fallback to old behavior if parsing failed - if channel_id_segment is None or message_segment is None: - channel_id_segment, message_segment = link.split('/')[-2:] - except Exception: # pragma: no cover - safe fallback - 
channel_id_segment, message_segment = link.split('/')[-2:] - - logger.debug('current message_id options: %s', message_segment) - range_message_id = utils.parse_ranges(message_segment) + channel_id, message_id = link.split('/')[-2:] + logger.debug('current message_id options: %s', message_id) + message_id_list: list[str] = [] + range_message_id = utils.parse_ranges(message_id) for (start, end) in range_message_id: logger.debug('range(%s, %s)', start, end) for current_message_id in range(start, end + 1): - raw_targets.append(( - channel_id_segment, - current_message_id, - utils.parse_message_id(discussion_segment) if discussion_segment is not None else ( - utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None - ) - )) + raw_targets.append((channel_id, current_message_id)) logger.debug('target: %s', raw_targets) output_filename: Optional[str] = args.output_filename @@ -225,7 +197,7 @@ def main(): coros: list[Coroutine[None, None, Optional[str]]] = [] exclude_names: list[str] = [] - for channel_id, message_id, target_discussion_message_id in raw_targets: + for channel_id, message_id in raw_targets: updated_channel_id = utils.parse_channel_id(channel_id) updated_message_id = utils.parse_message_id(message_id) logger.debug('downloading from %s:%s', channel_id,message_id) @@ -247,7 +219,7 @@ def main(): message_id=updated_message_id, output_filename=final_output_filename, infer_extension=args.infer_extension, - discussion_message_id=target_discussion_message_id, + discussion_message_id=utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None, ) coros.append(coro) From 18db7caf69f391b140ec5f54f514cf4adcf29cf3 Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Tue, 30 Sep 2025 09:35:25 +0300 Subject: [PATCH 3/8] add docker and makefile --- .dockerignore | 14 ++++++++++++++ .gitignore | 7 +++++++ Dockerfile | 27 +++++++++++++++++++++++++++ Makefile | 32 
++++++++++++++++++++++++++++++++ README.md | 23 +++++++++++++++++++++++ 5 files changed, 103 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 Makefile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d8971ac --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +# Ignore VCS and editor cruft +.git +.gitignore + +# Python build/test caches +__pycache__ +*.pyc +.pytest_cache +build + +# Local configs and sessions (mounted at runtime instead) +config.ini +*.session + diff --git a/.gitignore b/.gitignore index e6a0a0f..b279d0a 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,10 @@ cython_debug/ config.ini *.session + +*.mp4 +*.mp3 +*.jpeg +*.jpg +*.png +*.pdf diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cc1e846 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +# Prevent Python from writing .pyc files and enable unbuffered logs +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +# System deps: none required for runtime; keep slim +WORKDIR /app + +# Ensure readline is available for interactive prompts +RUN apt-get update \ + && apt-get install -y --no-install-recommends libreadline8 \ + && rm -rf /var/lib/apt/lists/* + +# Copy metadata first to leverage Docker layer caching +COPY pyproject.toml README.md /app/ +COPY src /app/src + +# Install the package so the `rcdtool` CLI entrypoint is available +RUN pip install --no-cache-dir . 
+ +# Use a dedicated working directory for user data (config, session, downloads) +WORKDIR /work + +# Default command runs the CLI; pass args after image name +ENTRYPOINT ["rcdtool"] +CMD ["--help"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..77c445b --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +SHELL := bash + +# Minimal, simple targets for a long-lived container +IMG ?= rcdtool +NAME ?= rcdtoold +DATA_DIR ?= $(CURDIR)/data +UID := $(shell id -u 2>/dev/null || echo 1000) +GID := $(shell id -g 2>/dev/null || echo 1000) + +.PHONY: build up down ps shell run + +build: + docker build -t $(IMG) . + +up: build + @mkdir -p "$(DATA_DIR)" + @if [ ! -f "$(DATA_DIR)/config.ini" ]; then cp -n config.ini.sample "$(DATA_DIR)/config.ini"; fi + @if docker ps -a --format '{{.Names}}' | grep -qx "$(NAME)"; then \ + docker start "$(NAME)" >/dev/null; \ + else \ + docker run -d --restart unless-stopped --name "$(NAME)" \ + -v "$(DATA_DIR):/work" \ + --user $(UID):$(GID) \ + --entrypoint sh "$(IMG)" -c 'sleep infinity' >/dev/null; \ + fi + @docker ps --filter name="^$(NAME)$$" --format 'Running: {{.Names}} ({{.Status}})' + +down: + @docker rm -f "$(NAME)" >/dev/null 2>&1 || true; echo "+ Removed: $(NAME)" + +shell: up + docker exec -it -w /work "$(NAME)" bash diff --git a/README.md b/README.md index aa19113..146affa 100644 --- a/README.md +++ b/README.md @@ -91,3 +91,26 @@ rcdtool -c config.ini -C qwert -M 34 -O download/base --infer-extension --- If you want to find a media in a comment on a channel post, use `--discussion-message-id` to set the message id of the comment. 
+ +## Docker + +You can run this app inside a docker container, see Makefile + + +### Makefile shortcuts + +Prefer one-liners via `make` (defaults include `--infer-extension` so files get a proper extension): + +- Build and prepare: + +``` +make build +make setup +``` + +- Start a long‑lived container once, then exec commands inside it: + +``` +make up # builds image, prepares data/, runs container as a daemon +make shell # optional: drop into /work inside the container +``` From 439d226620e2f2ceea9525a0ef9a9d66c666428d Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Tue, 30 Sep 2025 09:56:22 +0300 Subject: [PATCH 4/8] add app registration advices --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 146affa..88ea929 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ api_id: 32767 api_hash: ed855a59bbe4a3360dbf7a0538842142 ``` +You may have trouble registering a new app; if so, consider following this advice: https://habr.com/ru/articles/923168/ + Then rename `config.ini.sample` to `config.ini`, edit it and save wherever you want. If the file is in the same directory as `rcdtool` and its name is exactly "config.ini", then `rcdtool` will load it automatically. The first time, **rcdtool** will ask you for your phone number, and will start a login process. When this is done, a `.session` file will be created. With this `.session` file, the tool could access to your Telegram account to read messages and download medias. The name of the .session file is set in `config.ini`. 
From 946241e1a22c2ceeebecdb5d2a67374d25cead26 Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Sun, 5 Oct 2025 13:38:06 +0300 Subject: [PATCH 5/8] docker compose, batch download --- Dockerfile | 5 +- Makefile | 30 +++-- compose.yaml | 18 +++ scripts/rcdtool_from_messages.py | 194 +++++++++++++++++++++++++++++++ 4 files changed, 230 insertions(+), 17 deletions(-) create mode 100644 compose.yaml create mode 100644 scripts/rcdtool_from_messages.py diff --git a/Dockerfile b/Dockerfile index cc1e846..18bf262 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,10 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # Copy metadata first to leverage Docker layer caching -COPY pyproject.toml README.md /app/ +# Project metadata and helper scripts +COPY pyproject.toml README.md scripts /app/ +# Optional samples and inputs +COPY .stuff /app/.stuff COPY src /app/src # Install the package so the `rcdtool` CLI entrypoint is available diff --git a/Makefile b/Makefile index 77c445b..cb80e76 100644 --- a/Makefile +++ b/Makefile @@ -1,32 +1,30 @@ SHELL := bash -# Minimal, simple targets for a long-lived container +# docker compose driven workflow +COMPOSE ?= docker compose +SERVICE ?= rcdtoold IMG ?= rcdtool NAME ?= rcdtoold DATA_DIR ?= $(CURDIR)/data UID := $(shell id -u 2>/dev/null || echo 1000) GID := $(shell id -g 2>/dev/null || echo 1000) -.PHONY: build up down ps shell run +.PHONY: build up up-nc down ps bash logs restart build: - docker build -t $(IMG) . + UID=$(UID) GID=$(GID) $(COMPOSE) build -up: build +up: ## Start long‑lived dev container with volumes @mkdir -p "$(DATA_DIR)" @if [ ! 
-f "$(DATA_DIR)/config.ini" ]; then cp -n config.ini.sample "$(DATA_DIR)/config.ini"; fi - @if docker ps -a --format '{{.Names}}' | grep -qx "$(NAME)"; then \ - docker start "$(NAME)" >/dev/null; \ - else \ - docker run -d --restart unless-stopped --name "$(NAME)" \ - -v "$(DATA_DIR):/work" \ - --user $(UID):$(GID) \ - --entrypoint sh "$(IMG)" -c 'sleep infinity' >/dev/null; \ - fi - @docker ps --filter name="^$(NAME)$$" --format 'Running: {{.Names}} ({{.Status}})' + UID=$(UID) GID=$(GID) $(COMPOSE) up -d + @$(COMPOSE) ps down: - @docker rm -f "$(NAME)" >/dev/null 2>&1 || true; echo "+ Removed: $(NAME)" + $(COMPOSE) down -shell: up - docker exec -it -w /work "$(NAME)" bash +bash: + $(COMPOSE) exec -w /work $(SERVICE) bash + +restart: + $(COMPOSE) restart $(SERVICE) diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..5090a07 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,18 @@ +services: + rcdtoold: + container_name: rcdtoold + build: + context: . + image: rcdtool + restart: unless-stopped + working_dir: /work + user: "${UID}:${GID}" + environment: + - PYTHONDONTWRITEBYTECODE=1 + - PYTHONUNBUFFERED=1 + # Helpful for running `python -m rcdtool.main` against mounted source + - PYTHONPATH=/app/src + volumes: + - ./data:/work + - .:/app + entrypoint: ["sh", "-c", "sleep infinity"] diff --git a/scripts/rcdtool_from_messages.py b/scripts/rcdtool_from_messages.py new file mode 100644 index 0000000..a465454 --- /dev/null +++ b/scripts/rcdtool_from_messages.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +""" +Read `.stuff/messages.md` and run rcdtool per line. + +Each line format: ` ; ` + - ``: Telegram message link (e.g., https://t.me/c/123/456) + - ``: Used as the base output filename. + +Usage examples: + python scripts/rcdtool_from_messages.py \ + -f .stuff/messages.md -c config.ini --infer-extension + +Notes: + - Uses `-O ` to set the base filename. + - If `rcdtool` CLI is not on PATH, falls back to `python -m rcdtool.main` with PYTHONPATH=src. 
+ - Sanitizes the description into a safe filename while preserving Unicode letters. +""" + +from __future__ import annotations + +import argparse +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path + + +def sanitize_filename(name: str) -> str: + """Sanitize description into a safe base filename. + + - Keep Unicode letters/digits/underscore/space/.-() + - Replace path separators and other symbols with underscore + - Trim leading/trailing dots, spaces, and dashes + """ + name = (name or "").strip() + if not name: + return "file" + + # Replace path separators and disallowed critical characters + name = name.replace("/", "_").replace("\\", "_") + name = name.replace(":", "_") + + # Keep letters/digits/underscore/space/.-() and Unicode word chars + name = re.sub(r"[^\w\s().\-]+", "_", name, flags=re.UNICODE) + # Collapse multiple underscores or spaces + name = re.sub(r"[\s]+", " ", name, flags=re.UNICODE) + name = re.sub(r"_{2,}", "_", name) + + # Trim problematic leading/trailing chars + name = name.strip(" .-_") + return name or "file" + + +def resolve_executor() -> list[str] | None: + """Return the base command list to execute rcdtool. + + Prefer the `rcdtool` console script; if missing, use + `python -m rcdtool.main` with PYTHONPATH=src (handled when running). 
+ """ + exe = shutil.which("rcdtool") + if exe: + return [exe] + # Fallback to python -m rcdtool.main + return [sys.executable, "-m", "rcdtool.main"] + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run rcdtool for each link in a file") + parser.add_argument( + "-f", "--file", + default=".stuff/messages.md", + help="Path to input file (default: .stuff/messages.md)", + ) + parser.add_argument( + "-c", "--config", + default="config.ini", + help="Path to rcdtool config.ini (default: config.ini)", + ) + parser.add_argument( + "--infer-extension", + action="store_true", + help="Pass --infer-extension to rcdtool", + ) + parser.add_argument( + "--detailed-name", + action="store_true", + help="Pass --detailed-name to rcdtool (adds channel/message to name)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the commands without executing", + ) + args = parser.parse_args() + + in_path = Path(args.file) + if not in_path.exists(): + print(f"Input file not found: {in_path}", file=sys.stderr) + return 2 + + config_path = Path(args.config) + if not config_path.exists(): + print(f"Warning: config not found: {config_path}", file=sys.stderr) + + base_cmd = resolve_executor() + if base_cmd is None: + print("Unable to resolve rcdtool executor", file=sys.stderr) + return 2 + + # Ensure PYTHONPATH includes src for the fallback case + env = os.environ.copy() + if base_cmd[:3] == [sys.executable, "-m", "rcdtool.main"]: + src_path = str((Path.cwd() / "src").resolve()) + env["PYTHONPATH"] = f"{src_path}:{env.get('PYTHONPATH', '')}" if env.get("PYTHONPATH") else src_path + + # Process each non-empty, non-comment line + with in_path.open("r", encoding="utf-8") as fh: + for ln_num, raw in enumerate(fh, start=1): + line = raw.strip() + if not line or line.startswith("#"): + continue + + if ";" in line: + link, desc = line.split(";", 1) + link = link.strip() + desc = desc.strip() + else: + link = line + desc = "" + + if not link: + print(f"Skip 
line {ln_num}: missing link", file=sys.stderr) + continue + + out_base = sanitize_filename(desc) if desc else "file" + + # Detect /c/// or /c// + # We must skip the middle "topic" id if present and use the last part as message id. + cmd: list[str] + chan_msg = None # tuple[channel_id, message_id] + try: + if "/c/" in link: + after_c = link.split("/c/", 1)[1] + # Drop query/fragment if present + after_c = after_c.split("?", 1)[0].split("#", 1)[0] + parts = [p for p in after_c.split("/") if p] + + def _is_numlike(s: str) -> bool: + s2 = s.lstrip("+-") + return s2.isdigit() + + if len(parts) >= 2 and _is_numlike(parts[0]): + channel_id = parts[0] + # If triple or more segments, last segment is the message id; skip middle(s) + message_id = parts[-1] if _is_numlike(parts[-1]) else None + if len(parts) == 2: + # /c// + message_id = parts[1] if _is_numlike(parts[1]) else None + if message_id is not None: + chan_msg = (channel_id, message_id) + except Exception: + chan_msg = None + + if chan_msg: + channel_id, message_id = chan_msg + cmd = [*base_cmd, "-c", str(config_path), "-C", channel_id, "-M", message_id, "-O", out_base] + else: + cmd = [*base_cmd, "-c", str(config_path), "--link", link, "-O", out_base] + if args.infer_extension: + cmd.append("--infer-extension") + if args.detailed_name: + cmd.append("--detailed-name") + + if args.dry_run: + print("DRY:", " ".join(repr(c) if " " in c else c for c in cmd)) + continue + + print(f"Line {ln_num}: {link} -> {out_base}") + try: + # Stream output directly to the console + proc = subprocess.run(cmd, env=env, check=False) + if proc.returncode != 0: + print(f" Error (exit {proc.returncode}) on line {ln_num}", file=sys.stderr) + except FileNotFoundError as e: + print(f" Executor not found: {e}", file=sys.stderr) + return 127 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 975097826728d6ffc820bbc2c022e954b395ab26 Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Sun, 5 Oct 2025 14:28:41 +0300 
Subject: [PATCH 6/8] speed up download --- Dockerfile | 3 ++ Makefile | 19 ++++---- compose.yaml | 2 +- config.ini.sample | 11 ++++- src/rcdtool/main.py | 12 +++++ src/rcdtool/rcdtool.py | 103 +++++++++++++++++++++++++++++++++++------ 6 files changed, 126 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index 18bf262..7aeee1a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,9 @@ WORKDIR /app # Ensure readline is available for interactive prompts RUN apt-get update \ + && apt-get install curl -y \ + && curl -s https://packagecloud.io/install/repositories/ookla/speedtest-cli/script.deb.sh | bash \ + && apt-get install speedtest -y \ && apt-get install -y --no-install-recommends libreadline8 \ && rm -rf /var/lib/apt/lists/* diff --git a/Makefile b/Makefile index cb80e76..c43bfec 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,24 @@ SHELL := bash # docker compose driven workflow -COMPOSE ?= docker compose -SERVICE ?= rcdtoold -IMG ?= rcdtool -NAME ?= rcdtoold -DATA_DIR ?= $(CURDIR)/data -UID := $(shell id -u 2>/dev/null || echo 1000) -GID := $(shell id -g 2>/dev/null || echo 1000) +COMPOSE ?= docker compose +SERVICE ?= rcdtoold +IMG ?= rcdtool +NAME ?= rcdtoold +DATA_DIR ?= $(CURDIR)/data +# Avoid readonly UID in zsh; use HOST_* and pass as DOCKER_* to compose +HOST_UID := $(shell id -u 2>/dev/null || echo 1000) +HOST_GID := $(shell id -g 2>/dev/null || echo 1000) .PHONY: build up up-nc down ps bash logs restart build: - UID=$(UID) GID=$(GID) $(COMPOSE) build + DOCKER_UID=$(HOST_UID) DOCKER_GID=$(HOST_GID) $(COMPOSE) build up: ## Start long‑lived dev container with volumes @mkdir -p "$(DATA_DIR)" @if [ ! 
-f "$(DATA_DIR)/config.ini" ]; then cp -n config.ini.sample "$(DATA_DIR)/config.ini"; fi - UID=$(UID) GID=$(GID) $(COMPOSE) up -d + DOCKER_UID=$(HOST_UID) DOCKER_GID=$(HOST_GID) $(COMPOSE) up -d @$(COMPOSE) ps down: diff --git a/compose.yaml b/compose.yaml index 5090a07..1f0a83b 100644 --- a/compose.yaml +++ b/compose.yaml @@ -6,7 +6,7 @@ services: image: rcdtool restart: unless-stopped working_dir: /work - user: "${UID}:${GID}" + user: "${DOCKER_UID}:${DOCKER_GID}" environment: - PYTHONDONTWRITEBYTECODE=1 - PYTHONUNBUFFERED=1 diff --git a/config.ini.sample b/config.ini.sample index cf040e2..943e76d 100644 --- a/config.ini.sample +++ b/config.ini.sample @@ -8,5 +8,14 @@ hash = [your API hash here] [Client] timeout = 7000 device_model = scriptgram -lang_code = es-ES +lang_code = en-US +; Optional performance tuning +; concurrent download workers (1..8 is sensible) +workers = 4 +; chunk size per request in KiB (128, 256, 512, 1024) +part_size_kb = 512 +; retry behavior for unstable networks +request_retries = 5 +retry_delay = 2 +connection_retries = 5 diff --git a/src/rcdtool/main.py b/src/rcdtool/main.py index 5a80cec..470ac9a 100644 --- a/src/rcdtool/main.py +++ b/src/rcdtool/main.py @@ -107,6 +107,16 @@ def get_args(): action='store_true', default=False, help='Rename the file with the channel and message ids') + parser.add_argument('--workers', + dest='workers', + type=int, + default=None, + help='Concurrent download workers (default from config or 4)') + parser.add_argument('--part-size-kb', + dest='part_size_kb', + type=int, + default=None, + help='Chunk size in KiB for downloads (default from config or 512)') parser.add_argument('--dry-run', dest='dry_mode', action='store_true', @@ -219,6 +229,8 @@ def main(): message_id=updated_message_id, output_filename=final_output_filename, infer_extension=args.infer_extension, + workers=args.workers, + part_size_kb=args.part_size_kb, discussion_message_id=utils.parse_message_id(args.discussion_message_id) if 
args.discussion_message_id is not None else None, ) coros.append(coro) diff --git a/src/rcdtool/rcdtool.py b/src/rcdtool/rcdtool.py index 3b05074..bb3cac4 100644 --- a/src/rcdtool/rcdtool.py +++ b/src/rcdtool/rcdtool.py @@ -67,13 +67,24 @@ def create_client(self): Returns: TelegramClient: The Telegram client object. """ + # Optional tuning knobs with sensible defaults + timeout = int(self.config['Client'].get('timeout', '7000')) + device_model = self.config['Client'].get('device_model', 'scriptgram') + lang_code = self.config['Client'].get('lang_code', 'en-US') + request_retries = int(self.config['Client'].get('request_retries', '5')) + retry_delay = int(self.config['Client'].get('retry_delay', '2')) + connection_retries = int(self.config['Client'].get('connection_retries', '5')) + client = TelegramClient( session=self.config['Access']['session'], api_id=int(self.config['Access']['id']), api_hash=self.config['Access']['hash'], - timeout=int(self.config['Client']['timeout']), - device_model=self.config['Client']['device_model'], - lang_code=self.config['Client']['lang_code'], + timeout=timeout, + device_model=device_model, + lang_code=lang_code, + request_retries=request_retries, + retry_delay=retry_delay, + connection_retries=connection_retries, ) client.start() return client @@ -83,6 +94,8 @@ async def download_media(self, message_id: int, output_filename: str, infer_extension: Optional[bool] = None, + workers: Optional[int] = None, + part_size_kb: Optional[int] = None, discussion_message_id: Optional[int] = None, ): """Read a message in a channel and download the media to output. 
@@ -174,17 +187,81 @@ async def download_media(self, logger.warning('No media found') return - with open(output_filename, 'wb+') as file: - if isinstance(media, tg_types.MessageMediaPaidMedia): - logger.debug('paid message found') - for message_extended_media in media.extended_media: - if isinstance(message_extended_media, tg_types.MessageExtendedMedia): - await self.client.download_file(message_extended_media.media, file) + # Resolve defaults from config if not provided + cfg_workers = int(self.config['Client'].get('workers', '4')) + cfg_part_kb = int(self.config['Client'].get('part_size_kb', '512')) + + # Throttle logs: progress callback every ~1s + import time + last_t = 0.0 + last_b = 0 + def _progress(bytes_downloaded: int, total: Optional[int]): + nonlocal last_t, last_b + now = time.time() + if last_t == 0.0: + last_t, last_b = now, bytes_downloaded + return + if now - last_t >= 1.0: + delta_b = bytes_downloaded - last_b + speed = delta_b / (now - last_t) + # human readable speed + units = 'B/s' + val = speed + for u in ['KB/s','MB/s','GB/s']: + if val > 1024: + val /= 1024 + units = u else: - logger.warning('Cannot find a message extended media') - return - else: - await self.client.download_file(media, file) + break + if total: + logger.info('progress: %.1f%% at %.2f %s', bytes_downloaded * 100 / total, val, units) + else: + logger.info('progress: %d bytes at %.2f %s', bytes_downloaded, val, units) + last_t, last_b = now, bytes_downloaded + + # Use low-level download_file for broad Telethon compatibility and control + import inspect + + async def _dl_file(input_media, out_path: str): + sig = None + try: + sig = inspect.signature(self.client.download_file) + except Exception: + sig = None + + kwargs = { + 'file': out_path, + 'part_size_kb': part_size_kb or cfg_part_kb, + 'progress_callback': _progress, + } + # Add workers only if supported in this Telethon version + if sig and 'workers' in sig.parameters: + if workers or cfg_workers: + kwargs['workers'] = 
workers or cfg_workers + + try: + await self.client.download_file(input_media, **kwargs) + except TypeError: + # Fallback: remove optional kwargs progressively + kwargs.pop('progress_callback', None) + try: + await self.client.download_file(input_media, **kwargs) + except TypeError: + kwargs.pop('part_size_kb', None) + kwargs.pop('workers', None) + await self.client.download_file(input_media, **kwargs) + + if isinstance(media, tg_types.MessageMediaPaidMedia): + logger.debug('paid message found') + for message_extended_media in media.extended_media: + if isinstance(message_extended_media, tg_types.MessageExtendedMedia): + await _dl_file(message_extended_media.media, output_filename) + else: + logger.warning('Cannot find a message extended media') + return + logger.info('downloaded to %s', output_filename) + else: + await _dl_file(media, output_filename) logger.info('downloaded to %s', output_filename) if infer_extension: From e520347ade9e84950b35054503a579c6350bb965 Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Sun, 5 Oct 2025 14:36:42 +0300 Subject: [PATCH 7/8] speed up download --- src/rcdtool/rcdtool.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/rcdtool/rcdtool.py b/src/rcdtool/rcdtool.py index bb3cac4..7d57c97 100644 --- a/src/rcdtool/rcdtool.py +++ b/src/rcdtool/rcdtool.py @@ -195,6 +195,16 @@ async def download_media(self, import time last_t = 0.0 last_b = 0 + + def _fmt_bytes(n: int | float) -> tuple[float, str]: + units = ['B', 'KB', 'MB', 'GB', 'TB'] + val = float(n) + idx = 0 + while val >= 1024.0 and idx < len(units) - 1: + val /= 1024.0 + idx += 1 + return val, units[idx] + def _progress(bytes_downloaded: int, total: Optional[int]): nonlocal last_t, last_b now = time.time() @@ -204,19 +214,15 @@ def _progress(bytes_downloaded: int, total: Optional[int]): if now - last_t >= 1.0: delta_b = bytes_downloaded - last_b speed = delta_b / (now - last_t) - # human readable speed - units = 'B/s' - val 
= speed - for u in ['KB/s','MB/s','GB/s']: - if val > 1024: - val /= 1024 - units = u - else: - break + spd_val, spd_unit = _fmt_bytes(speed) + cur_val, cur_unit = _fmt_bytes(bytes_downloaded) if total: - logger.info('progress: %.1f%% at %.2f %s', bytes_downloaded * 100 / total, val, units) + tot_val, tot_unit = _fmt_bytes(total) + percent = bytes_downloaded * 100 / total + logger.info('progress: %.2f %s/%.2f %s (%.1f%%) at %.2f %s', + cur_val, cur_unit, tot_val, tot_unit, percent, spd_val, spd_unit) else: - logger.info('progress: %d bytes at %.2f %s', bytes_downloaded, val, units) + logger.info('progress: %.2f %s at %.2f %s', cur_val, cur_unit, spd_val, spd_unit) last_t, last_b = now, bytes_downloaded # Use low-level download_file for broad Telethon compatibility and control From f7630c6386a08ff0bf85bd80194268d05bde58cc Mon Sep 17 00:00:00 2001 From: Savin Mikhail Date: Tue, 7 Oct 2025 14:02:05 +0300 Subject: [PATCH 8/8] show progress --- .gitignore | 3 + README.md | 4 + scripts/rcdtool_from_messages.py | 154 +++++++++++++++++++++++-------- src/rcdtool/rcdtool.py | 51 +++++++++- 4 files changed, 174 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index b279d0a..6ed211e 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,6 @@ config.ini *.jpg *.png *.pdf +*.mov +*.webm +*.mkv diff --git a/README.md b/README.md index 88ea929..8a542aa 100644 --- a/README.md +++ b/README.md @@ -116,3 +116,7 @@ make setup make up # builds image, prepares data/, runs container as a daemon make shell # optional: drop into /work inside the container ``` + +```bash +docker exec -it -w /work rcdtoold python3 /app/rcdtool_from_messages.py --infer-extension -f /app/.stuff/messages.md -c /work/config.ini +``` \ No newline at end of file diff --git a/scripts/rcdtool_from_messages.py b/scripts/rcdtool_from_messages.py index a465454..87849dc 100644 --- a/scripts/rcdtool_from_messages.py +++ b/scripts/rcdtool_from_messages.py @@ -3,17 +3,21 @@ Read `.stuff/messages.md` and 
run rcdtool per line. Each line format: ` ; ` - - ``: Telegram message link (e.g., https://t.me/c/123/456) + - ``: Telegram message link (e.g., https://t.me/c///) - ``: Used as the base output filename. +Two execution modes: + - inproc (default): import and reuse a single RCD client (avoids SQLite session lock issues) + - subprocess: shell out to the `rcdtool` CLI for each line + Usage examples: python scripts/rcdtool_from_messages.py \ - -f .stuff/messages.md -c config.ini --infer-extension + -f .stuff/messages.md -c config.ini --infer-extension --mode inproc Notes: - - Uses `-O ` to set the base filename. - - If `rcdtool` CLI is not on PATH, falls back to `python -m rcdtool.main` with PYTHONPATH=src. - Sanitizes the description into a safe filename while preserving Unicode letters. + - In inproc mode, supports --workers/--part-size-kb to tune performance. + - If `rcdtool` CLI is not on PATH, subprocess mode falls back to `python -m rcdtool.main` with PYTHONPATH=src. """ from __future__ import annotations @@ -26,6 +30,18 @@ import sys from pathlib import Path +# In inproc mode we import the package directly (PYTHONPATH=/app/src is set in compose) +try: + from rcdtool.rcdtool import RCD + import rcdtool.utils as utils + from rcdtool.main import generate_unique_filename + _HAVE_INPROC = True +except Exception: + RCD = None # type: ignore + utils = None # type: ignore + generate_unique_filename = None # type: ignore + _HAVE_INPROC = False + def sanitize_filename(name: str) -> str: """Sanitize description into a safe base filename. 
@@ -88,6 +104,24 @@ def main() -> int: action="store_true", help="Pass --detailed-name to rcdtool (adds channel/message to name)", ) + parser.add_argument( + "--workers", + type=int, + default=None, + help="Concurrent download workers (inproc mode)", + ) + parser.add_argument( + "--part-size-kb", + type=int, + default=None, + help="Chunk size in KiB (inproc mode)", + ) + parser.add_argument( + "--mode", + choices=["inproc", "subprocess"], + default="inproc", + help="Execution mode (default: inproc)", + ) parser.add_argument( "--dry-run", action="store_true", @@ -104,18 +138,23 @@ def main() -> int: if not config_path.exists(): print(f"Warning: config not found: {config_path}", file=sys.stderr) - base_cmd = resolve_executor() - if base_cmd is None: - print("Unable to resolve rcdtool executor", file=sys.stderr) - return 2 - - # Ensure PYTHONPATH includes src for the fallback case - env = os.environ.copy() - if base_cmd[:3] == [sys.executable, "-m", "rcdtool.main"]: - src_path = str((Path.cwd() / "src").resolve()) - env["PYTHONPATH"] = f"{src_path}:{env.get('PYTHONPATH', '')}" if env.get("PYTHONPATH") else src_path + use_inproc = (args.mode == "inproc") and _HAVE_INPROC + if not use_inproc: + base_cmd = resolve_executor() + if base_cmd is None: + print("Unable to resolve rcdtool executor", file=sys.stderr) + return 2 + # Ensure PYTHONPATH includes src for the fallback case + env = os.environ.copy() + if base_cmd[:3] == [sys.executable, "-m", "rcdtool.main"]: + src_path = str((Path.cwd() / "src").resolve()) + env["PYTHONPATH"] = f"{src_path}:{env.get('PYTHONPATH', '')}" if env.get("PYTHONPATH") else src_path + else: + # Build a single reusable client + rcd_tool = RCD(str(config_path), dry_mode=args.dry_run) # type: ignore[name-defined] # Process each non-empty, non-comment line + exclude_names: list[str] = [] with in_path.open("r", encoding="utf-8") as fh: for ln_num, raw in enumerate(fh, start=1): line = raw.strip() @@ -138,7 +177,6 @@ def main() -> int: # Detect 
/c/// or /c// # We must skip the middle "topic" id if present and use the last part as message id. - cmd: list[str] chan_msg = None # tuple[channel_id, message_id] try: if "/c/" in link: @@ -163,29 +201,73 @@ def _is_numlike(s: str) -> bool: except Exception: chan_msg = None - if chan_msg: + if use_inproc and chan_msg: channel_id, message_id = chan_msg - cmd = [*base_cmd, "-c", str(config_path), "-C", channel_id, "-M", message_id, "-O", out_base] + final_output = generate_unique_filename( # type: ignore[name-defined] + out_base, + bool(args.detailed_name), + f'-{channel_id}-{message_id}', + exclude_names, + ) + exclude_names.append(final_output) + + if args.dry_run: + print(f"DRY INPROC: {link} -> {final_output}") + continue + + print(f"Line {ln_num}: {link} -> {out_base}") + try: + res = rcd_tool.client.loop.run_until_complete( # type: ignore[attr-defined] + rcd_tool.download_media( + channel_id=utils.parse_channel_id(channel_id), # type: ignore[name-defined] + message_id=utils.parse_message_id(message_id), # type: ignore[name-defined] + output_filename=final_output, + infer_extension=args.infer_extension, + workers=args.workers, + part_size_kb=args.part_size_kb, + ) + ) + if res: + print(res) + except Exception as e: + print(f" Error: {e}", file=sys.stderr) else: - cmd = [*base_cmd, "-c", str(config_path), "--link", link, "-O", out_base] - if args.infer_extension: - cmd.append("--infer-extension") - if args.detailed_name: - cmd.append("--detailed-name") - - if args.dry_run: - print("DRY:", " ".join(repr(c) if " " in c else c for c in cmd)) - continue - - print(f"Line {ln_num}: {link} -> {out_base}") - try: - # Stream output directly to the console - proc = subprocess.run(cmd, env=env, check=False) - if proc.returncode != 0: - print(f" Error (exit {proc.returncode}) on line {ln_num}", file=sys.stderr) - except FileNotFoundError as e: - print(f" Executor not found: {e}", file=sys.stderr) - return 127 + # subprocess mode or non-standard link; fall back to CLI + 
base_cmd = resolve_executor() + if base_cmd is None: + print("Unable to resolve rcdtool executor", file=sys.stderr) + return 2 + env = os.environ.copy() + if base_cmd[:3] == [sys.executable, "-m", "rcdtool.main"]: + src_path = str((Path.cwd() / "src").resolve()) + env["PYTHONPATH"] = f"{src_path}:{env.get('PYTHONPATH', '')}" if env.get("PYTHONPATH") else src_path + + if chan_msg: + channel_id, message_id = chan_msg + cmd = [*base_cmd, "-c", str(config_path), "-C", channel_id, "-M", message_id, "-O", out_base] + else: + cmd = [*base_cmd, "-c", str(config_path), "--link", link, "-O", out_base] + if args.infer_extension: + cmd.append("--infer-extension") + if args.detailed_name: + cmd.append("--detailed-name") + if args.workers: + cmd += ["--workers", str(args.workers)] + if args.part_size_kb: + cmd += ["--part-size-kb", str(args.part_size_kb)] + + if args.dry_run: + print("DRY:", " ".join(repr(c) if " " in c else c for c in cmd)) + continue + + print(f"Line {ln_num}: {link} -> {out_base}") + try: + proc = subprocess.run(cmd, env=env, check=False) + if proc.returncode != 0: + print(f" Error (exit {proc.returncode}) on line {ln_num}", file=sys.stderr) + except FileNotFoundError as e: + print(f" Executor not found: {e}", file=sys.stderr) + return 127 return 0 diff --git a/src/rcdtool/rcdtool.py b/src/rcdtool/rcdtool.py index 7d57c97..0bad684 100644 --- a/src/rcdtool/rcdtool.py +++ b/src/rcdtool/rcdtool.py @@ -133,7 +133,7 @@ async def download_media(self, logger.warning('Cannot continue because the got type is not a Message') return - logger.info('downloading...') + # defer logging until we finalize the target message if discussion_message_id: logger.info('finding message from a discussion group') @@ -181,7 +181,54 @@ async def download_media(self, if self.dry_mode: return output_filename - + + # Try to compute and log media size (when available) before download + def _fmt_bytes(n: int | float) -> tuple[float, str]: + units = ['B', 'KB', 'MB', 'GB', 'TB'] + val = 
float(n) + idx = 0 + while val >= 1024.0 and idx < len(units) - 1: + val /= 1024.0 + idx += 1 + return val, units[idx] + + def _get_media_size(m) -> Optional[int]: + try: + if isinstance(m, tg_types.MessageMediaDocument) and isinstance(m.document, tg_types.Document): + return int(getattr(m.document, 'size', 0) or 0) or None + if isinstance(m, tg_types.MessageMediaPhoto) and isinstance(m.photo, tg_types.Photo): + sizes = getattr(m.photo, 'sizes', []) or [] + candidates = [getattr(s, 'size', None) for s in sizes] + candidates = [int(x) for x in candidates if isinstance(x, int)] + return max(candidates) if candidates else None + if isinstance(m, tg_types.MessageMediaPaidMedia): + total = 0 + found = False + for em in m.extended_media: + if isinstance(em, tg_types.MessageExtendedMedia): + inner = em.media + if isinstance(inner, tg_types.Document): + total += int(getattr(inner, 'size', 0) or 0) + found = True + elif isinstance(inner, tg_types.Photo): + sizes = getattr(inner, 'sizes', []) or [] + candidates = [getattr(s, 'size', None) for s in sizes] + candidates = [int(x) for x in candidates if isinstance(x, int)] + if candidates: + total += max(candidates) + found = True + return total if found else None + except Exception: + return None + return None + + pre_media = message.media + pre_size = _get_media_size(pre_media) + if pre_size: + s_val, s_unit = _fmt_bytes(pre_size) + logger.info('size: %.2f %s', s_val, s_unit) + logger.info('downloading...') + media = message.media if media is None: logger.warning('No media found')