diff --git a/tests/e2e/test_scrapy/__init__.py b/tests/e2e/test_scrapy/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/tests/e2e/test_scrapy/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/e2e/test_scrapy/actor_source/__init__.py b/tests/e2e/test_scrapy/actor_source/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/e2e/test_scrapy/actor_source/__main__.py b/tests/e2e/test_scrapy/actor_source/__main__.py
new file mode 100644
index 00000000..edfdaae5
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/__main__.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from scrapy.utils.reactor import install_reactor
+
+install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
+
+import os  # noqa: E402, I001
+
+from apify.scrapy import initialize_logging, run_scrapy_actor  # noqa: E402
+
+from .main import main  # noqa: E402
+
+os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
+
+if __name__ == '__main__':
+    initialize_logging()
+    run_scrapy_actor(main())
diff --git a/tests/e2e/test_scrapy/actor_source/items.py b/tests/e2e/test_scrapy/actor_source/items.py
new file mode 100644
index 00000000..cb57ca27
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/items.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from scrapy import Field, Item
+
+
+class ProductItem(Item):
+    name = Field()
+    url = Field()
+    price = Field()
+    description = Field()
diff --git a/tests/e2e/test_scrapy/actor_source/main.py b/tests/e2e/test_scrapy/actor_source/main.py
new file mode 100644
index 00000000..173b5961
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/main.py
@@ -0,0 +1,16 @@
+from __future__ import annotations  # noqa: I001
+
+from scrapy.crawler import CrawlerRunner
+from scrapy.utils.defer import deferred_to_future
+
+from apify import Actor
+from apify.scrapy import apply_apify_settings
+
+from .spiders import Spider  # ty: ignore[unresolved-import]
+
+
+async def main() -> None:
+    async with Actor:
+        settings = apply_apify_settings()
+        runner = CrawlerRunner(settings)
+        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
diff --git a/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py b/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
new file mode 100644
index 00000000..39250de3
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
@@ -0,0 +1,19 @@
+from __future__ import annotations  # noqa: I001
+
+import os
+
+from scrapy.crawler import CrawlerRunner
+from scrapy.utils.defer import deferred_to_future
+
+from apify import Actor
+from apify.scrapy import apply_apify_settings
+
+from .spiders import Spider  # ty: ignore[unresolved-import]
+
+
+async def main() -> None:
+    async with Actor:
+        os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings_custom_pipeline'
+        settings = apply_apify_settings()
+        runner = CrawlerRunner(settings)
+        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
diff --git a/tests/e2e/test_scrapy/actor_source/pipelines.py b/tests/e2e/test_scrapy/actor_source/pipelines.py
new file mode 100644
index 00000000..2367baf1
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/pipelines.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from scrapy import Item, Spider
+
+
+class PriceCleanerPipeline:
+    def process_item(
+        self,
+        item: Item,
+        _: Spider,
+    ) -> Item:
+        if 'price' in item and isinstance(item['price'], str):
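+            # lstrip('$') removes the leading dollar sign, e.g. '$19.99' -> '19.99'.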
+            item['price'] = item['price'].lstrip('$')
+        return item
diff --git a/tests/e2e/test_scrapy/actor_source/server.py b/tests/e2e/test_scrapy/actor_source/server.py
new file mode 100644
index 00000000..20aff81a
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/server.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable, Coroutine
+from typing import Any
+
+from uvicorn import Config
+from uvicorn.server import Server
+
+Receive = Callable[[], Awaitable[dict[str, Any]]]
+Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
+
+_PRODUCTS = {
+    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
+
+
+async def _send_html(send: Send, html: str, status: int = 200) -> None:
+    await send(
+        {
+            'type': 'http.response.start',
+            'status': status,
+            'headers': [[b'content-type', b'text/html; charset=utf-8']],
+        }
+    )
+    await send({'type': 'http.response.body', 'body': html.encode()})
+
+
+async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    # Pages are shaped to match the selectors the test spiders rely on: 'title::text',
+    # 'a[href*="/products/"]', 'h1::text', 'span.price::text' and 'p.description::text'.
+    assert scope['type'] == 'http'
+    path = scope['path']
+
+    if path == '/':
+        await _send_html(
+            send,
+            '<html>'
+            '<head><title>E-commerce Test Store</title></head>'
+            '<body>'
+            '<h1>Products</h1>'
+            + ''.join(f'<a href="/products/{pid}">{product["name"]}</a>' for pid, product in _PRODUCTS.items())
+            + '<a href="/about">About</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/products/'):
+        product = _PRODUCTS.get(path.removeprefix('/products/'))
+        if product:
+            await _send_html(
+                send,
+                f'<html><head><title>{product["name"]}</title></head><body>'
+                f'<h1>{product["name"]}</h1>'
+                f'<span class="price">{product["price"]}</span>'
+                f'<p class="description">{product["description"]}</p>'
+                f'<a href="/">Back to Home</a>'
+                f'</body></html>',
+            )
+        else:
+            await _send_html(send, '<html><body>Not Found</body></html>', 404)
+    elif path == '/about':
+        await _send_html(
+            send,
+            '<html><head><title>About</title></head><body><p>We sell the best widgets in the world.</p>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    else:
+        await _send_html(send, '<html><body>Not Found</body></html>', 404)
+
+
+if __name__ == '__main__':
+    asyncio.run(
+        Server(
+            config=Config(
+                app=app,
+                lifespan='off',
+                loop='asyncio',
+                port=8080,
+                log_config=None,
+                log_level=logging.CRITICAL,
+            )
+        ).serve()
+    )
diff --git a/tests/e2e/test_scrapy/actor_source/settings.py b/tests/e2e/test_scrapy/actor_source/settings.py
new file mode 100644
index 00000000..860f9432
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/settings.py
@@ -0,0 +1,8 @@
+BOT_NAME = 'testbot'
+LOG_LEVEL = 'INFO'
+NEWSPIDER_MODULE = 'src.spiders'
+ROBOTSTXT_OBEY = False
+SPIDER_MODULES = ['src.spiders']
+TELNETCONSOLE_ENABLED = False
+TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
+HTTPCACHE_ENABLED = False
diff --git a/tests/e2e/test_scrapy/actor_source/settings_custom_pipeline.py b/tests/e2e/test_scrapy/actor_source/settings_custom_pipeline.py
new file mode 100644
index 00000000..7dfa245d
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/settings_custom_pipeline.py
@@ -0,0 +1,5 @@
+from src.settings import *  # noqa: F403 # ty: ignore[unresolved-import]
+
+ITEM_PIPELINES = {
+    'src.pipelines.PriceCleanerPipeline': 100,
+}
diff --git a/tests/e2e/test_scrapy/actor_source/spider_basic.py b/tests/e2e/test_scrapy/actor_source/spider_basic.py
new file mode 100644
index 00000000..546a0c34
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/spider_basic.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from scrapy import Request, Spider
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from scrapy.http.response import Response
+
+
+class BasicSpider(Spider):
+    name = 'basic_spider'
+
+    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.start_urls = start_urls
+
+    def start_requests(self) -> Generator[Request, None, None]:
+        for url in self.start_urls:
+            yield Request(url, callback=self.parse)
+
+    def parse(self, response: Response) -> Generator[dict | Request, None, None]:
+        for link in response.css('a[href*="/products/"]::attr(href)').getall():
+            yield response.follow(link, callback=self.parse_product)
+
+    def parse_product(self, response: Response) -> Generator[dict, None, None]:
+        yield {
+            'url': response.url,
+            'name': response.css('h1::text').get(''),
+            'price': response.css('span.price::text').get(''),
+            'description': response.css('p.description::text').get(''),
+        }
diff --git a/tests/e2e/test_scrapy/actor_source/spider_cb_kwargs.py b/tests/e2e/test_scrapy/actor_source/spider_cb_kwargs.py
new file mode 100644
index 00000000..c62b105c
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/spider_cb_kwargs.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from scrapy import Request, Spider
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from scrapy.http.response import Response
+
+
+class CbKwargsSpider(Spider):
+    name = 'cb_kwargs_spider'
+
+    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.start_urls = start_urls
+
+    def start_requests(self) -> Generator[Request, None, None]:
+        for url in self.start_urls:
+            yield Request(url, callback=self.parse)
+
+    def parse(self, response: Response) -> Generator[Request, None, None]:
+        source = response.css('title::text').get('')
+        for link in response.css('a[href*="/products/"]::attr(href)').getall():
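+            # Forward the listing page title to parse_product via cb_kwargs.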
+            yield response.follow(link, callback=self.parse_product, cb_kwargs={'source': source})
+
+    def parse_product(self, response: Response, source: str) -> Generator[dict, None, None]:
+        yield {
+            'url': response.url,
+            'name': response.css('h1::text').get(''),
+            'price': response.css('span.price::text').get(''),
+            'description': response.css('p.description::text').get(''),
+            'source': source,
+        }
diff --git a/tests/e2e/test_scrapy/actor_source/spider_crawl.py b/tests/e2e/test_scrapy/actor_source/spider_crawl.py
new file mode 100644
index 00000000..79d690eb
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/spider_crawl.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from scrapy.http.response import Response
+
+
+class CrawlProductSpider(CrawlSpider):
+    name = 'crawl_product_spider'
+
+    rules = (Rule(LinkExtractor(allow=r'/products/'), callback='parse_product'),)
+
+    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.start_urls = start_urls
+
+    def parse_product(self, response: Response) -> Generator[dict, None, None]:
+        yield {
+            'url': response.url,
+            'name': response.css('h1::text').get(''),
+            'price': response.css('span.price::text').get(''),
+            'description': response.css('p.description::text').get(''),
+        }
diff --git a/tests/e2e/test_scrapy/actor_source/spider_custom_pipeline.py b/tests/e2e/test_scrapy/actor_source/spider_custom_pipeline.py
new file mode 100644
index 00000000..e16b3f54
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/spider_custom_pipeline.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from scrapy import Request, Spider
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from scrapy.http.response import Response
+
+
+class CustomPipelineSpider(Spider):
+    name = 'custom_pipeline_spider'
+
+    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.start_urls = start_urls
+
+    def start_requests(self) -> Generator[Request, None, None]:
+        for url in self.start_urls:
+            yield Request(url, callback=self.parse)
+
+    def parse(self, response: Response) -> Generator[Request, None, None]:
+        for link in response.css('a[href*="/products/"]::attr(href)').getall():
+            yield response.follow(link, callback=self.parse_product)
+
+    def parse_product(self, response: Response) -> Generator[dict, None, None]:
+        yield {
+            'url': response.url,
+            'name': response.css('h1::text').get(''),
+            'price': response.css('span.price::text').get(''),
+            'description': response.css('p.description::text').get(''),
+        }
diff --git a/tests/e2e/test_scrapy/actor_source/spider_itemloader.py b/tests/e2e/test_scrapy/actor_source/spider_itemloader.py
new file mode 100644
index 00000000..aeae3090
--- /dev/null
+++ b/tests/e2e/test_scrapy/actor_source/spider_itemloader.py
@@ -0,0 +1,47 @@
+from __future__ import annotations  # noqa: I001
+
+from typing import TYPE_CHECKING, Any
+
+from itemloaders.processors import MapCompose, TakeFirst
+from scrapy import Request, Spider
+from scrapy.loader import ItemLoader
+
+from src.items import ProductItem  # ty: ignore[unresolved-import]
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from scrapy.http.response import Response
+
+
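+# Trims whitespace on input and keeps only the first extracted value per field.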
+class ProductItemLoader(ItemLoader):
+    default_item_class = ProductItem
+    default_output_processor = TakeFirst()
+    name_in = MapCompose(str.strip)
+    price_in = MapCompose(str.strip)
+    description_in = MapCompose(str.strip)
+
+
+class ItemLoaderSpider(Spider):
+    name = 'itemloader_spider'
+
+    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        self.start_urls = start_urls
+
+    def start_requests(self) -> Generator[Request, None, None]:
+        for url in self.start_urls:
+            yield Request(url, callback=self.parse)
+
+    def parse(self, response: Response) -> Generator[Request, None, None]:
+        for link in response.css('a[href*="/products/"]::attr(href)').getall():
+            yield response.follow(link, callback=self.parse_product)
+
+    def parse_product(self, response: Response) -> Generator[ProductItem, None, None]:
+        loader = ProductItemLoader(response=response)  # ty: ignore[invalid-argument-type]
+        loader.add_value('url', response.url)
+        loader.add_css('name', 'h1::text')
+        loader.add_css('price', 'span.price::text')
+        loader.add_css('description', 'p.description::text')
+        yield loader.load_item()
diff --git a/tests/e2e/test_scrapy/conftest.py b/tests/e2e/test_scrapy/conftest.py
new file mode 100644
index 00000000..f5c0cc10
--- /dev/null
+++ b/tests/e2e/test_scrapy/conftest.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from apify_client.clients.resource_clients import ActorClientAsync
+
+    from apify._models import ActorRun
+
+_ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source'
+
+
+def read_actor_source(filename: str) -> str:
+    return (_ACTOR_SOURCE_DIR / filename).read_text()
+
+
+def get_scrapy_source_files(
+    spider_file: str,
+    spider_class_name: str,
+    *,
+    extra_source_files: dict[str, str] | None = None,
+) -> dict[str, str]:
+    source_files: dict[str, str] = {
+        'server.py': read_actor_source('server.py'),
+        'src/__main__.py': read_actor_source('__main__.py'),
+        'src/main.py': read_actor_source('main.py'),
+        'src/settings.py': read_actor_source('settings.py'),
+        'src/items.py': read_actor_source('items.py'),
+        'src/spiders/__init__.py': f'from .spider import {spider_class_name} as Spider\n',
+        'src/spiders/spider.py': read_actor_source(spider_file),
+    }
+    if extra_source_files:
+        source_files.update(extra_source_files)
+    return source_files
+
+
+_EXPECTED_PRODUCTS = {
+    'Widget A': {'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    'Widget B': {'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    'Widget C': {'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
+
+
+async def verify_spider_results(
+    actor: ActorClientAsync,
+    run_result: ActorRun,
+    *,
+    expected_products: dict[str, dict[str, str]] | None = None,
+) -> None:
+    assert run_result.status == 'SUCCEEDED'
+
+    products = expected_products or _EXPECTED_PRODUCTS
+
+    items = await actor.last_run().dataset().list_items()
+    assert items.count == 3
+
+    items_by_name = {item['name']: item for item in items.items}
+
+    for name, expected in products.items():
+        assert name in items_by_name, f'Missing product: {name}'
+        item = items_by_name[name]
+        assert 'url' in item
+        for key, value in expected.items():
+            assert item[key] == value, f'Product {name}: expected {key}={value!r}, got {item[key]!r}'
diff --git a/tests/e2e/test_scrapy/test_basic_spider.py b/tests/e2e/test_scrapy/test_basic_spider.py
new file mode 100644
index 00000000..b0a69c07
--- /dev/null
+++ b/tests/e2e/test_scrapy/test_basic_spider.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_scrapy_source_files, verify_spider_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_basic_spider(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='scrapy-basic',
+        source_files=get_scrapy_source_files('spider_basic.py', 'BasicSpider'),
+        additional_requirements=['scrapy~=2.12.0'],
+    )
+    run_result = await run_actor(actor)
+    await verify_spider_results(actor, run_result)
diff --git a/tests/e2e/test_scrapy/test_cb_kwargs_spider.py b/tests/e2e/test_scrapy/test_cb_kwargs_spider.py
new file mode 100644
index 00000000..599dd6d2
--- /dev/null
+++ b/tests/e2e/test_scrapy/test_cb_kwargs_spider.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_scrapy_source_files, verify_spider_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+_EXPECTED_PRODUCTS = {
+    'Widget A': {
+        'price': '$19.99',
+        'description': 'A basic widget for everyday use',
+        'source': 'E-commerce Test Store',
+    },
+    'Widget B': {
+        'price': '$29.99',
+        'description': 'An advanced widget with extra features',
+        'source': 'E-commerce Test Store',
+    },
+    'Widget C': {
+        'price': '$39.99',
+        'description': 'A premium widget for professionals',
+        'source': 'E-commerce Test Store',
+    },
+}
+
+
+async def test_cb_kwargs_spider(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='scrapy-cb-kwargs',
+        source_files=get_scrapy_source_files('spider_cb_kwargs.py', 'CbKwargsSpider'),
+        additional_requirements=['scrapy~=2.12.0'],
+    )
+    run_result = await run_actor(actor)
+    await verify_spider_results(actor, run_result, expected_products=_EXPECTED_PRODUCTS)
diff --git a/tests/e2e/test_scrapy/test_crawl_spider.py b/tests/e2e/test_scrapy/test_crawl_spider.py
new file mode 100644
index 00000000..f4c3b7b2
--- /dev/null
+++ b/tests/e2e/test_scrapy/test_crawl_spider.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_scrapy_source_files, verify_spider_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_crawl_spider(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='scrapy-crawl',
+        source_files=get_scrapy_source_files('spider_crawl.py', 'CrawlProductSpider'),
+        additional_requirements=['scrapy~=2.12.0'],
+    )
+    run_result = await run_actor(actor)
+    await verify_spider_results(actor, run_result)
diff --git a/tests/e2e/test_scrapy/test_custom_pipeline_spider.py b/tests/e2e/test_scrapy/test_custom_pipeline_spider.py
new file mode 100644
index 00000000..65f03cc9
--- /dev/null
+++ b/tests/e2e/test_scrapy/test_custom_pipeline_spider.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_scrapy_source_files, read_actor_source, verify_spider_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+_EXPECTED_PRODUCTS = {
+    'Widget A': {'price': '19.99', 'description': 'A basic widget for everyday use'},
+    'Widget B': {'price': '29.99', 'description': 'An advanced widget with extra features'},
+    'Widget C': {'price': '39.99', 'description': 'A premium widget for professionals'},
+}
+
+
+async def test_custom_pipeline_spider(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='scrapy-pipeline',
+        source_files=get_scrapy_source_files(
+            'spider_custom_pipeline.py',
+            'CustomPipelineSpider',
+            extra_source_files={
+                'src/main.py': read_actor_source('main_custom_pipeline.py'),
+                'src/settings_custom_pipeline.py': read_actor_source('settings_custom_pipeline.py'),
+                'src/pipelines.py': read_actor_source('pipelines.py'),
+            },
+        ),
+        additional_requirements=['scrapy~=2.12.0'],
+    )
+    run_result = await run_actor(actor)
+    await verify_spider_results(actor, run_result, expected_products=_EXPECTED_PRODUCTS)
diff --git a/tests/e2e/test_scrapy/test_itemloader_spider.py b/tests/e2e/test_scrapy/test_itemloader_spider.py
new file mode 100644
index 00000000..1b486d66
--- /dev/null
+++ b/tests/e2e/test_scrapy/test_itemloader_spider.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_scrapy_source_files, verify_spider_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_itemloader_spider(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='scrapy-itemloader',
+        source_files=get_scrapy_source_files('spider_itemloader.py', 'ItemLoaderSpider'),
+        additional_requirements=['scrapy~=2.12.0'],
+    )
+    run_result = await run_actor(actor)
+    await verify_spider_results(actor, run_result)