From c6e9543bbdb615c4a7924335ab0a819956ed92b3 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Wed, 4 Feb 2026 22:25:09 -0800 Subject: [PATCH] timeout param + passive captcha handling --- sentience/agent_runtime.py | 43 ++++++++++ sentience/backends/snapshot.py | 5 +- sentience/models.py | 1 + sentience/snapshot.py | 32 +++++++- tests/test_snapshot_gateway_timeout.py | 108 +++++++++++++++++++++++++ traces/test-run.jsonl | 5 ++ 6 files changed, 189 insertions(+), 5 deletions(-) create mode 100644 tests/test_snapshot_gateway_timeout.py diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index b7cb4a3..a48fc3c 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -612,9 +612,52 @@ def _list(name: str) -> list[str]: iframe_hits = _list("iframe_src_hits") url_hits = _list("url_hits") text_hits = _list("text_hits") + selector_hits = _list("selector_hits") + # If we only saw selector/script hints, treat as non-blocking. if not iframe_hits and not url_hits and not text_hits: return False + + # Heuristic: many sites include a passive reCAPTCHA badge (v3) that should NOT block. + # We only want to block when there's evidence of an interactive challenge. + hits_all = [*iframe_hits, *url_hits, *text_hits, *selector_hits] + hits_l = [str(x).lower() for x in hits_all if x] + + strong_text = any( + k in " ".join(hits_l) + for k in ( + "i'm not a robot", + "verify you are human", + "human verification", + "complete the security check", + "please verify", + ) + ) + strong_iframe = any( + any(k in h for k in ("api2/bframe", "hcaptcha", "turnstile")) + for h in hits_l + ) + strong_selector = any( + any( + k in h + for k in ( + "g-recaptcha-response", + "h-captcha-response", + "cf-turnstile-response", + "recaptcha-checkbox", + "hcaptcha-checkbox", + ) + ) + for h in hits_l + ) + only_generic = ( + not strong_text + and not strong_iframe + and not strong_selector + and all(("captcha" in h or "recaptcha" in h) for h in hits_l) + ) + if only_generic: + return False confidence = getattr(captcha, "confidence", 0.0) return confidence >= self._captcha_options.min_confidence diff --git a/sentience/backends/snapshot.py b/sentience/backends/snapshot.py index b09b1cb..f81b363 100644 --- a/sentience/backends/snapshot.py +++ b/sentience/backends/snapshot.py @@ -595,7 +595,10 @@ async def _snapshot_via_api( try: api_result = await _post_snapshot_to_gateway_async( - payload, options.sentience_api_key, api_url + payload, + options.sentience_api_key, + api_url, + timeout_s=options.gateway_timeout_s, ) # Merge API result with local data (screenshot, etc.) diff --git a/sentience/models.py b/sentience/models.py index f1d9a6e..07c062c 100644 --- a/sentience/models.py +++ b/sentience/models.py @@ -776,6 +776,7 @@ class SnapshotOptions(BaseModel): limit: int = Field(50, ge=1, le=500) filter: SnapshotFilter | None = None use_api: bool | None = None # Force API vs extension + gateway_timeout_s: float | None = None # Gateway snapshot timeout (seconds) save_trace: bool = False # Save raw_elements to JSON for benchmarking/training trace_path: str | None = None # Path to save trace (default: "trace_{timestamp}.json") goal: str | None = None # Optional goal/task description for the snapshot diff --git a/sentience/snapshot.py b/sentience/snapshot.py index e0393fd..73e334c 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -106,6 +106,13 @@ def from_httpx(cls, e: Exception) -> "SnapshotGatewayError": bits.append(f"err_type={type(e).__name__}") if err_s: bits.append(f"err={err_s}") + else: + # Some transport errors (e.g. httpx.ReadError) can stringify to "". + # Include repr() so callers can still see the exception type/shape. + try: + bits.append(f"err_repr={cls._snip(repr(e), 220)}") + except Exception: + pass if bits: msg = f"{msg}: " + " ".join(bits) msg = msg + ". Try using use_api=False to use local extension instead." @@ -162,6 +169,11 @@ def from_requests(cls, e: Exception) -> "SnapshotGatewayError": bits.append(f"err_type={type(e).__name__}") if err_s: bits.append(f"err={err_s}") + else: + try: + bits.append(f"err_repr={cls._snip(repr(e), 220)}") + except Exception: + pass if bits: msg = f"{msg}: " + " ".join(bits) msg = msg + ". Try using use_api=False to use local extension instead." @@ -311,6 +323,8 @@ def _post_snapshot_to_gateway_sync( payload: dict[str, Any], api_key: str, api_url: str = SENTIENCE_API_URL, + *, + timeout_s: float | None = None, ) -> dict[str, Any]: """ Post snapshot payload to gateway (synchronous). @@ -326,11 +340,12 @@ def _post_snapshot_to_gateway_sync( } try: + timeout = 30 if timeout_s is None else float(timeout_s) response = requests.post( f"{api_url}/v1/snapshot", data=payload_json, headers=headers, - timeout=30, + timeout=timeout, ) response.raise_for_status() return response.json() @@ -345,6 +360,8 @@ async def _post_snapshot_to_gateway_async( payload: dict[str, Any], api_key: str, api_url: str = SENTIENCE_API_URL, + *, + timeout_s: float | None = None, ) -> dict[str, Any]: """ Post snapshot payload to gateway (asynchronous). @@ -362,7 +379,8 @@ async def _post_snapshot_to_gateway_async( "Content-Type": "application/json", } - async with httpx.AsyncClient(timeout=30.0) as client: + timeout = 30.0 if timeout_s is None else float(timeout_s) + async with httpx.AsyncClient(timeout=timeout) as client: try: response = await client.post( f"{api_url}/v1/snapshot", @@ -604,7 +622,12 @@ def _snapshot_via_api( payload = _build_snapshot_payload(raw_result, options) try: - api_result = _post_snapshot_to_gateway_sync(payload, api_key, api_url) + api_result = _post_snapshot_to_gateway_sync( + payload, + api_key, + api_url, + timeout_s=options.gateway_timeout_s, + ) # Merge API result with local data (screenshot, etc.) snapshot_data = _merge_api_result_with_local(api_result, raw_result) @@ -923,7 +946,8 @@ async def _snapshot_via_api_async( # Lazy import httpx - only needed for async API calls import httpx - async with httpx.AsyncClient(timeout=30.0) as client: + timeout = 30.0 if options.gateway_timeout_s is None else float(options.gateway_timeout_s) + async with httpx.AsyncClient(timeout=timeout) as client: response = await client.post( f"{api_url}/v1/snapshot", content=payload_json, diff --git a/tests/test_snapshot_gateway_timeout.py b/tests/test_snapshot_gateway_timeout.py new file mode 100644 index 0000000..d115878 --- /dev/null +++ b/tests/test_snapshot_gateway_timeout.py @@ -0,0 +1,108 @@ +import asyncio +import importlib +import sys + +snapshot_module = importlib.import_module("sentience.snapshot") +from sentience.snapshot import _post_snapshot_to_gateway_async, _post_snapshot_to_gateway_sync + + +class _DummyResponse: + def raise_for_status(self): + return None + + def json(self): + return {"status": "success", "elements": [], "url": "https://example.com"} + + +def test_post_snapshot_async_uses_default_timeout(monkeypatch): + class DummyClient: + last_timeout = None + + def __init__(self, timeout): + DummyClient.last_timeout = timeout + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return None + + async def post(self, *args, **kwargs): + return _DummyResponse() + + dummy_httpx = type("DummyHttpx", (), {"AsyncClient": DummyClient}) + monkeypatch.setitem(sys.modules, "httpx", dummy_httpx) + asyncio.run( + _post_snapshot_to_gateway_async( + {"raw_elements": [], "url": "https://example.com", "viewport": None, "goal": None, "options": {}}, + "sk_test", + "https://api.sentienceapi.com", + ) + ) + assert DummyClient.last_timeout == 30.0 + + +def test_post_snapshot_async_uses_custom_timeout(monkeypatch): + class DummyClient: + last_timeout = None + + def __init__(self, timeout): + DummyClient.last_timeout = timeout + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return None + + async def post(self, *args, **kwargs): + return _DummyResponse() + + dummy_httpx = type("DummyHttpx", (), {"AsyncClient": DummyClient}) + monkeypatch.setitem(sys.modules, "httpx", dummy_httpx) + asyncio.run( + _post_snapshot_to_gateway_async( + {"raw_elements": [], "url": "https://example.com", "viewport": None, "goal": None, "options": {}}, + "sk_test", + "https://api.sentienceapi.com", + timeout_s=12.5, + ) + ) + assert DummyClient.last_timeout == 12.5 + + +def test_post_snapshot_sync_uses_default_timeout(monkeypatch): + class DummyRequests: + last_timeout = None + + @staticmethod + def post(*args, **kwargs): + DummyRequests.last_timeout = kwargs.get("timeout") + return _DummyResponse() + + monkeypatch.setattr(snapshot_module, "requests", DummyRequests) + _post_snapshot_to_gateway_sync( + {"raw_elements": [], "url": "https://example.com", "viewport": None, "goal": None, "options": {}}, + "sk_test", + "https://api.sentienceapi.com", + ) + assert DummyRequests.last_timeout == 30 + + +def test_post_snapshot_sync_uses_custom_timeout(monkeypatch): + class DummyRequests: + last_timeout = None + + @staticmethod + def post(*args, **kwargs): + DummyRequests.last_timeout = kwargs.get("timeout") + return _DummyResponse() + + monkeypatch.setattr(snapshot_module, "requests", DummyRequests) + _post_snapshot_to_gateway_sync( + {"raw_elements": [], "url": "https://example.com", "viewport": None, "goal": None, "options": {}}, + "sk_test", + "https://api.sentienceapi.com", + timeout_s=9.0, + ) + assert DummyRequests.last_timeout == 9.0 diff --git a/traces/test-run.jsonl b/traces/test-run.jsonl index e69de29..e1982f7 100644 --- a/traces/test-run.jsonl +++ b/traces/test-run.jsonl @@ -0,0 +1,5 @@ +{"v": 1, "type": "run_start", "ts": "2026-02-05T06:20:59.000Z", "run_id": "test-run", "seq": 1, "data": {"agent": "SentienceAgent"}, "ts_ms": 1770272459843} +{"v": 1, "type": "run_start", "ts": "2026-02-05T06:20:59.000Z", "run_id": "test-run", "seq": 1, "data": {"agent": "SentienceAgent"}, "ts_ms": 1770272459845} +{"v": 1, "type": "run_start", "ts": "2026-02-05T06:20:59.000Z", "run_id": "test-run", "seq": 1, "data": {"agent": "SentienceAgent"}, "ts_ms": 1770272459846} +{"v": 1, "type": "run_start", "ts": "2026-02-05T06:20:59.000Z", "run_id": "test-run", "seq": 1, "data": {"agent": "SentienceAgent"}, "ts_ms": 1770272459848} +{"v": 1, "type": "run_start", "ts": "2026-02-05T06:20:59.000Z", "run_id": "test-run", "seq": 1, "data": {"agent": "SentienceAgent"}, "ts_ms": 1770272459855}