diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d8971ac --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +# Ignore VCS and editor cruft +.git +.gitignore + +# Python build/test caches +__pycache__ +*.pyc +.pytest_cache +build + +# Local configs and sessions (mounted at runtime instead) +config.ini +*.session + diff --git a/.gitignore b/.gitignore index e6a0a0f..6ed211e 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,13 @@ cython_debug/ config.ini *.session + +*.mp4 +*.mp3 +*.jpeg +*.jpg +*.png +*.pdf +*.mov +*.webm +*.mkv diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..7aeee1a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM python:3.11-slim + +# Prevent Python from writing .pyc files and enable unbuffered logs +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +# System deps: none required for runtime; keep slim +WORKDIR /app + +# Ensure readline is available for interactive prompts +RUN apt-get update \ + && apt-get install curl -y \ + && curl -s https://packagecloud.io/install/repositories/ookla/speedtest-cli/script.deb.sh | bash \ + && apt-get install speedtest -y \ + && apt-get install -y --no-install-recommends libreadline8 \ + && rm -rf /var/lib/apt/lists/* + +# Copy metadata first to leverage Docker layer caching +# Project metadata and helper scripts +COPY pyproject.toml README.md scripts /app/ +# Optional samples and inputs +COPY .stuff /app/.stuff +COPY src /app/src + +# Install the package so the `rcdtool` CLI entrypoint is available +RUN pip install --no-cache-dir . 
+ +# Use a dedicated working directory for user data (config, session, downloads) +WORKDIR /work + +# Default command runs the CLI; pass args after image name +ENTRYPOINT ["rcdtool"] +CMD ["--help"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c43bfec --- /dev/null +++ b/Makefile @@ -0,0 +1,31 @@ +SHELL := bash + +# docker compose driven workflow +COMPOSE ?= docker compose +SERVICE ?= rcdtoold +IMG ?= rcdtool +NAME ?= rcdtoold +DATA_DIR ?= $(CURDIR)/data +# Avoid readonly UID in zsh; use HOST_* and pass as DOCKER_* to compose +HOST_UID := $(shell id -u 2>/dev/null || echo 1000) +HOST_GID := $(shell id -g 2>/dev/null || echo 1000) + +.PHONY: build up up-nc down ps bash logs restart + +build: + DOCKER_UID=$(HOST_UID) DOCKER_GID=$(HOST_GID) $(COMPOSE) build + +up: ## Start long‑lived dev container with volumes + @mkdir -p "$(DATA_DIR)" + @if [ ! -f "$(DATA_DIR)/config.ini" ]; then cp -n config.ini.sample "$(DATA_DIR)/config.ini"; fi + DOCKER_UID=$(HOST_UID) DOCKER_GID=$(HOST_GID) $(COMPOSE) up -d + @$(COMPOSE) ps + +down: + $(COMPOSE) down + +bash: + $(COMPOSE) exec -w /work $(SERVICE) bash + +restart: + $(COMPOSE) restart $(SERVICE) diff --git a/README.md b/README.md index aa19113..8a542aa 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ api_id: 32767 api_hash: ed855a59bbe4a3360dbf7a0538842142 ``` +You might have problems registering a new app; consider following this advice: https://habr.com/ru/articles/923168/ + Then rename `config.ini.sample` to `config.ini`, edit it and save wherever you want. If the file is in the same directory as `rcdtool` and its name is exactly "config.ini", then `rcdtool` will load it automatically. The first time, **rcdtool** will ask you for your phone number, and will start a login process. When this is done, a `.session` file will be created. With this `.session` file, the tool could access to your Telegram account to read messages and download medias. 
The name of the .session file is set in `config.ini`. @@ -91,3 +93,30 @@ rcdtool -c config.ini -C qwert -M 34 -O download/base --infer-extension --- If you want to find a media in a comment on a channel post, use `--discussion-message-id` to set the message id of the comment. + +## Docker + +You can run this app inside a Docker container; see the Makefile + + +### Makefile shortcuts + +Prefer one-liners via `make` (defaults include `--infer-extension` so files get a proper extension): + +- Build and prepare: + +``` +make build +make up +``` + +- Start a long‑lived container once, then exec commands inside it: + +``` +make up # builds image, prepares data/, runs container as a daemon +make bash # optional: drop into /work inside the container +``` + +```bash +docker exec -it -w /work rcdtoold python3 /app/rcdtool_from_messages.py --infer-extension -f /app/.stuff/messages.md -c /work/config.ini +``` \ No newline at end of file diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..1f0a83b --- --- /dev/null +++ b/compose.yaml @@ -0,0 +1,18 @@ +services: + rcdtoold: + container_name: rcdtoold + build: + context: . 
+ image: rcdtool + restart: unless-stopped + working_dir: /work + user: "${DOCKER_UID}:${DOCKER_GID}" + environment: + - PYTHONDONTWRITEBYTECODE=1 + - PYTHONUNBUFFERED=1 + # Helpful for running `python -m rcdtool.main` against mounted source + - PYTHONPATH=/app/src + volumes: + - ./data:/work + - .:/app + entrypoint: ["sh", "-c", "sleep infinity"] diff --git a/config.ini.sample b/config.ini.sample index cf040e2..943e76d 100644 --- a/config.ini.sample +++ b/config.ini.sample @@ -8,5 +8,14 @@ hash = [your API hash here] [Client] timeout = 7000 device_model = scriptgram -lang_code = es-ES +lang_code = en-US +; Optional performance tuning +; concurrent download workers (1..8 is sensible) +workers = 4 +; chunk size per request in KiB (128, 256, 512, 1024) +part_size_kb = 512 +; retry behavior for unstable networks +request_retries = 5 +retry_delay = 2 +connection_retries = 5 diff --git a/scripts/rcdtool_from_messages.py b/scripts/rcdtool_from_messages.py new file mode 100644 index 0000000..87849dc --- /dev/null +++ b/scripts/rcdtool_from_messages.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +""" +Read `.stuff/messages.md` and run rcdtool per line. + +Each line format: ` ; ` + - ``: Telegram message link (e.g., https://t.me/c///) + - ``: Used as the base output filename. + +Two execution modes: + - inproc (default): import and reuse a single RCD client (avoids SQLite session lock issues) + - subprocess: shell out to the `rcdtool` CLI for each line + +Usage examples: + python scripts/rcdtool_from_messages.py \ + -f .stuff/messages.md -c config.ini --infer-extension --mode inproc + +Notes: + - Sanitizes the description into a safe filename while preserving Unicode letters. + - In inproc mode, supports --workers/--part-size-kb to tune performance. + - If `rcdtool` CLI is not on PATH, subprocess mode falls back to `python -m rcdtool.main` with PYTHONPATH=src. 
+""" + +from __future__ import annotations + +import argparse +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path + +# In inproc mode we import the package directly (PYTHONPATH=/app/src is set in compose) +try: + from rcdtool.rcdtool import RCD + import rcdtool.utils as utils + from rcdtool.main import generate_unique_filename + _HAVE_INPROC = True +except Exception: + RCD = None # type: ignore + utils = None # type: ignore + generate_unique_filename = None # type: ignore + _HAVE_INPROC = False + + +def sanitize_filename(name: str) -> str: + """Sanitize description into a safe base filename. + + - Keep Unicode letters/digits/underscore/space/.-() + - Replace path separators and other symbols with underscore + - Trim leading/trailing dots, spaces, and dashes + """ + name = (name or "").strip() + if not name: + return "file" + + # Replace path separators and disallowed critical characters + name = name.replace("/", "_").replace("\\", "_") + name = name.replace(":", "_") + + # Keep letters/digits/underscore/space/.-() and Unicode word chars + name = re.sub(r"[^\w\s().\-]+", "_", name, flags=re.UNICODE) + # Collapse multiple underscores or spaces + name = re.sub(r"[\s]+", " ", name, flags=re.UNICODE) + name = re.sub(r"_{2,}", "_", name) + + # Trim problematic leading/trailing chars + name = name.strip(" .-_") + return name or "file" + + +def resolve_executor() -> list[str] | None: + """Return the base command list to execute rcdtool. + + Prefer the `rcdtool` console script; if missing, use + `python -m rcdtool.main` with PYTHONPATH=src (handled when running). 
+ """ + exe = shutil.which("rcdtool") + if exe: + return [exe] + # Fallback to python -m rcdtool.main + return [sys.executable, "-m", "rcdtool.main"] + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run rcdtool for each link in a file") + parser.add_argument( + "-f", "--file", + default=".stuff/messages.md", + help="Path to input file (default: .stuff/messages.md)", + ) + parser.add_argument( + "-c", "--config", + default="config.ini", + help="Path to rcdtool config.ini (default: config.ini)", + ) + parser.add_argument( + "--infer-extension", + action="store_true", + help="Pass --infer-extension to rcdtool", + ) + parser.add_argument( + "--detailed-name", + action="store_true", + help="Pass --detailed-name to rcdtool (adds channel/message to name)", + ) + parser.add_argument( + "--workers", + type=int, + default=None, + help="Concurrent download workers (inproc mode)", + ) + parser.add_argument( + "--part-size-kb", + type=int, + default=None, + help="Chunk size in KiB (inproc mode)", + ) + parser.add_argument( + "--mode", + choices=["inproc", "subprocess"], + default="inproc", + help="Execution mode (default: inproc)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the commands without executing", + ) + args = parser.parse_args() + + in_path = Path(args.file) + if not in_path.exists(): + print(f"Input file not found: {in_path}", file=sys.stderr) + return 2 + + config_path = Path(args.config) + if not config_path.exists(): + print(f"Warning: config not found: {config_path}", file=sys.stderr) + + use_inproc = (args.mode == "inproc") and _HAVE_INPROC + if not use_inproc: + base_cmd = resolve_executor() + if base_cmd is None: + print("Unable to resolve rcdtool executor", file=sys.stderr) + return 2 + # Ensure PYTHONPATH includes src for the fallback case + env = os.environ.copy() + if base_cmd[:3] == [sys.executable, "-m", "rcdtool.main"]: + src_path = str((Path.cwd() / "src").resolve()) + env["PYTHONPATH"] = 
f"{src_path}:{env.get('PYTHONPATH', '')}" if env.get("PYTHONPATH") else src_path + else: + # Build a single reusable client + rcd_tool = RCD(str(config_path), dry_mode=args.dry_run) # type: ignore[name-defined] + + # Process each non-empty, non-comment line + exclude_names: list[str] = [] + with in_path.open("r", encoding="utf-8") as fh: + for ln_num, raw in enumerate(fh, start=1): + line = raw.strip() + if not line or line.startswith("#"): + continue + + if ";" in line: + link, desc = line.split(";", 1) + link = link.strip() + desc = desc.strip() + else: + link = line + desc = "" + + if not link: + print(f"Skip line {ln_num}: missing link", file=sys.stderr) + continue + + out_base = sanitize_filename(desc) if desc else "file" + + # Detect /c/// or /c// + # We must skip the middle "topic" id if present and use the last part as message id. + chan_msg = None # tuple[channel_id, message_id] + try: + if "/c/" in link: + after_c = link.split("/c/", 1)[1] + # Drop query/fragment if present + after_c = after_c.split("?", 1)[0].split("#", 1)[0] + parts = [p for p in after_c.split("/") if p] + + def _is_numlike(s: str) -> bool: + s2 = s.lstrip("+-") + return s2.isdigit() + + if len(parts) >= 2 and _is_numlike(parts[0]): + channel_id = parts[0] + # If triple or more segments, last segment is the message id; skip middle(s) + message_id = parts[-1] if _is_numlike(parts[-1]) else None + if len(parts) == 2: + # /c// + message_id = parts[1] if _is_numlike(parts[1]) else None + if message_id is not None: + chan_msg = (channel_id, message_id) + except Exception: + chan_msg = None + + if use_inproc and chan_msg: + channel_id, message_id = chan_msg + final_output = generate_unique_filename( # type: ignore[name-defined] + out_base, + bool(args.detailed_name), + f'-{channel_id}-{message_id}', + exclude_names, + ) + exclude_names.append(final_output) + + if args.dry_run: + print(f"DRY INPROC: {link} -> {final_output}") + continue + + print(f"Line {ln_num}: {link} -> {out_base}") + try: 
+ res = rcd_tool.client.loop.run_until_complete( # type: ignore[attr-defined] + rcd_tool.download_media( + channel_id=utils.parse_channel_id(channel_id), # type: ignore[name-defined] + message_id=utils.parse_message_id(message_id), # type: ignore[name-defined] + output_filename=final_output, + infer_extension=args.infer_extension, + workers=args.workers, + part_size_kb=args.part_size_kb, + ) + ) + if res: + print(res) + except Exception as e: + print(f" Error: {e}", file=sys.stderr) + else: + # subprocess mode or non-standard link; fall back to CLI + base_cmd = resolve_executor() + if base_cmd is None: + print("Unable to resolve rcdtool executor", file=sys.stderr) + return 2 + env = os.environ.copy() + if base_cmd[:3] == [sys.executable, "-m", "rcdtool.main"]: + src_path = str((Path.cwd() / "src").resolve()) + env["PYTHONPATH"] = f"{src_path}:{env.get('PYTHONPATH', '')}" if env.get("PYTHONPATH") else src_path + + if chan_msg: + channel_id, message_id = chan_msg + cmd = [*base_cmd, "-c", str(config_path), "-C", channel_id, "-M", message_id, "-O", out_base] + else: + cmd = [*base_cmd, "-c", str(config_path), "--link", link, "-O", out_base] + if args.infer_extension: + cmd.append("--infer-extension") + if args.detailed_name: + cmd.append("--detailed-name") + if args.workers: + cmd += ["--workers", str(args.workers)] + if args.part_size_kb: + cmd += ["--part-size-kb", str(args.part_size_kb)] + + if args.dry_run: + print("DRY:", " ".join(repr(c) if " " in c else c for c in cmd)) + continue + + print(f"Line {ln_num}: {link} -> {out_base}") + try: + proc = subprocess.run(cmd, env=env, check=False) + if proc.returncode != 0: + print(f" Error (exit {proc.returncode}) on line {ln_num}", file=sys.stderr) + except FileNotFoundError as e: + print(f" Executor not found: {e}", file=sys.stderr) + return 127 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/rcdtool/main.py b/src/rcdtool/main.py index 5a80cec..470ac9a 100644 --- 
a/src/rcdtool/main.py +++ b/src/rcdtool/main.py @@ -107,6 +107,16 @@ def get_args(): action='store_true', default=False, help='Rename the file with the channel and message ids') + parser.add_argument('--workers', + dest='workers', + type=int, + default=None, + help='Concurrent download workers (default from config or 4)') + parser.add_argument('--part-size-kb', + dest='part_size_kb', + type=int, + default=None, + help='Chunk size in KiB for downloads (default from config or 512)') parser.add_argument('--dry-run', dest='dry_mode', action='store_true', @@ -219,6 +229,8 @@ def main(): message_id=updated_message_id, output_filename=final_output_filename, infer_extension=args.infer_extension, + workers=args.workers, + part_size_kb=args.part_size_kb, discussion_message_id=utils.parse_message_id(args.discussion_message_id) if args.discussion_message_id is not None else None, ) coros.append(coro) diff --git a/src/rcdtool/rcdtool.py b/src/rcdtool/rcdtool.py index 3b05074..0bad684 100644 --- a/src/rcdtool/rcdtool.py +++ b/src/rcdtool/rcdtool.py @@ -67,13 +67,24 @@ def create_client(self): Returns: TelegramClient: The Telegram client object. 
""" + # Optional tuning knobs with sensible defaults + timeout = int(self.config['Client'].get('timeout', '7000')) + device_model = self.config['Client'].get('device_model', 'scriptgram') + lang_code = self.config['Client'].get('lang_code', 'en-US') + request_retries = int(self.config['Client'].get('request_retries', '5')) + retry_delay = int(self.config['Client'].get('retry_delay', '2')) + connection_retries = int(self.config['Client'].get('connection_retries', '5')) + client = TelegramClient( session=self.config['Access']['session'], api_id=int(self.config['Access']['id']), api_hash=self.config['Access']['hash'], - timeout=int(self.config['Client']['timeout']), - device_model=self.config['Client']['device_model'], - lang_code=self.config['Client']['lang_code'], + timeout=timeout, + device_model=device_model, + lang_code=lang_code, + request_retries=request_retries, + retry_delay=retry_delay, + connection_retries=connection_retries, ) client.start() return client @@ -83,6 +94,8 @@ async def download_media(self, message_id: int, output_filename: str, infer_extension: Optional[bool] = None, + workers: Optional[int] = None, + part_size_kb: Optional[int] = None, discussion_message_id: Optional[int] = None, ): """Read a message in a channel and download the media to output. 
@@ -120,7 +133,7 @@ async def download_media(self, logger.warning('Cannot continue because the got type is not a Message') return - logger.info('downloading...') + # defer logging until we finalize the target message if discussion_message_id: logger.info('finding message from a discussion group') @@ -168,23 +181,140 @@ async def download_media(self, if self.dry_mode: return output_filename - + + # Try to compute and log media size (when available) before download + def _fmt_bytes(n: int | float) -> tuple[float, str]: + units = ['B', 'KB', 'MB', 'GB', 'TB'] + val = float(n) + idx = 0 + while val >= 1024.0 and idx < len(units) - 1: + val /= 1024.0 + idx += 1 + return val, units[idx] + + def _get_media_size(m) -> Optional[int]: + try: + if isinstance(m, tg_types.MessageMediaDocument) and isinstance(m.document, tg_types.Document): + return int(getattr(m.document, 'size', 0) or 0) or None + if isinstance(m, tg_types.MessageMediaPhoto) and isinstance(m.photo, tg_types.Photo): + sizes = getattr(m.photo, 'sizes', []) or [] + candidates = [getattr(s, 'size', None) for s in sizes] + candidates = [int(x) for x in candidates if isinstance(x, int)] + return max(candidates) if candidates else None + if isinstance(m, tg_types.MessageMediaPaidMedia): + total = 0 + found = False + for em in m.extended_media: + if isinstance(em, tg_types.MessageExtendedMedia): + inner = em.media + if isinstance(inner, tg_types.Document): + total += int(getattr(inner, 'size', 0) or 0) + found = True + elif isinstance(inner, tg_types.Photo): + sizes = getattr(inner, 'sizes', []) or [] + candidates = [getattr(s, 'size', None) for s in sizes] + candidates = [int(x) for x in candidates if isinstance(x, int)] + if candidates: + total += max(candidates) + found = True + return total if found else None + except Exception: + return None + return None + + pre_media = message.media + pre_size = _get_media_size(pre_media) + if pre_size: + s_val, s_unit = _fmt_bytes(pre_size) + logger.info('size: %.2f %s', 
s_val, s_unit) + logger.info('downloading...') + media = message.media if media is None: logger.warning('No media found') return - with open(output_filename, 'wb+') as file: - if isinstance(media, tg_types.MessageMediaPaidMedia): - logger.debug('paid message found') - for message_extended_media in media.extended_media: - if isinstance(message_extended_media, tg_types.MessageExtendedMedia): - await self.client.download_file(message_extended_media.media, file) - else: - logger.warning('Cannot find a message extended media') - return - else: - await self.client.download_file(media, file) + # Resolve defaults from config if not provided + cfg_workers = int(self.config['Client'].get('workers', '4')) + cfg_part_kb = int(self.config['Client'].get('part_size_kb', '512')) + + # Throttle logs: progress callback every ~1s + import time + last_t = 0.0 + last_b = 0 + + def _fmt_bytes(n: int | float) -> tuple[float, str]: + units = ['B', 'KB', 'MB', 'GB', 'TB'] + val = float(n) + idx = 0 + while val >= 1024.0 and idx < len(units) - 1: + val /= 1024.0 + idx += 1 + return val, units[idx] + + def _progress(bytes_downloaded: int, total: Optional[int]): + nonlocal last_t, last_b + now = time.time() + if last_t == 0.0: + last_t, last_b = now, bytes_downloaded + return + if now - last_t >= 1.0: + delta_b = bytes_downloaded - last_b + speed = delta_b / (now - last_t) + spd_val, spd_unit = _fmt_bytes(speed) + cur_val, cur_unit = _fmt_bytes(bytes_downloaded) + if total: + tot_val, tot_unit = _fmt_bytes(total) + percent = bytes_downloaded * 100 / total + logger.info('progress: %.2f %s/%.2f %s (%.1f%%) at %.2f %s', + cur_val, cur_unit, tot_val, tot_unit, percent, spd_val, spd_unit) + else: + logger.info('progress: %.2f %s at %.2f %s', cur_val, cur_unit, spd_val, spd_unit) + last_t, last_b = now, bytes_downloaded + + # Use low-level download_file for broad Telethon compatibility and control + import inspect + + async def _dl_file(input_media, out_path: str): + sig = None + try: + sig = 
inspect.signature(self.client.download_file) + except Exception: + sig = None + + kwargs = { + 'file': out_path, + 'part_size_kb': part_size_kb or cfg_part_kb, + 'progress_callback': _progress, + } + # Add workers only if supported in this Telethon version + if sig and 'workers' in sig.parameters: + if workers or cfg_workers: + kwargs['workers'] = workers or cfg_workers + + try: + await self.client.download_file(input_media, **kwargs) + except TypeError: + # Fallback: remove optional kwargs progressively + kwargs.pop('progress_callback', None) + try: + await self.client.download_file(input_media, **kwargs) + except TypeError: + kwargs.pop('part_size_kb', None) + kwargs.pop('workers', None) + await self.client.download_file(input_media, **kwargs) + + if isinstance(media, tg_types.MessageMediaPaidMedia): + logger.debug('paid message found') + for message_extended_media in media.extended_media: + if isinstance(message_extended_media, tg_types.MessageExtendedMedia): + await _dl_file(message_extended_media.media, output_filename) + else: + logger.warning('Cannot find a message extended media') + return + logger.info('downloaded to %s', output_filename) + else: + await _dl_file(media, output_filename) logger.info('downloaded to %s', output_filename) if infer_extension: