diff --git a/README.md b/README.md index 1133d95..3cdbe34 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,18 @@ To construct the `AoPS-Instruct` dataset: ``` This will produce a raw jsonlines file: out/items_raw.jl to be processed in the next steps. - Note: To perform a test run to make sure the whole pipeline is working, in the crawling script, you can add the `test_mode` option to crawl only 1000 datapoints: + **Date Range Options**: You can specify the date range for crawling with `--start_date` and `--end_date` (both in `YYYY-MM` format): + ```bash + # Crawl data from 2025 only + bash scripts/crawl_raw.sh --start_date "2025-01" --end_date "2025-12" + + # Crawl data from a custom range + bash scripts/crawl_raw.sh --start_date "2020-01" --end_date "2025-12" + ``` + + The default configuration crawls **2025 data** (`START_DATE="2025-01"`, `END_DATE="2025-12"`). + + **Test Mode**: To perform a test run to make sure the whole pipeline is working, you can add the `test_mode` option to crawl only 1000 datapoints: ```bash bash scripts/crawl_raw.sh --test_mode True ``` diff --git a/aops_crawler/aops_crawler/spiders/aops_spider.py b/aops_crawler/aops_crawler/spiders/aops_spider.py index fdabd32..f41765a 100755 --- a/aops_crawler/aops_crawler/spiders/aops_spider.py +++ b/aops_crawler/aops_crawler/spiders/aops_spider.py @@ -9,7 +9,7 @@ class AOPSSpider(scrapy.Spider): name = 'aops' - MAX_TOPICS = 3_600_000 + MAX_TOPICS = 5_000_000 # start_urls = [f'https://artofproblemsolving.com/community/c6h{i}' for i in range(1, 500)] def __init__(self, total_spiders=None, spider_idx=None, start_date='2000-01', test_mode=False, **kwargs): diff --git a/scripts/crawl_raw.sh b/scripts/crawl_raw.sh index 600e1c9..f1761bb 100644 --- a/scripts/crawl_raw.sh +++ b/scripts/crawl_raw.sh @@ -3,8 +3,8 @@ set -e # Exit if any command fails # Default values TEST_MODE="False" -START_DATE="2000-01" -END_DATE="2024-12" +START_DATE="2025-01" +END_DATE="2025-12" ITEMS_RAW_PATH="../out/items_raw.jl" ITEMS_RAW_FILTERED_PATH="../out/items_filtered.jl"