Location: curllm_core/streamware/components/extraction/
Pure LLM-based product extraction system. NO HARDCODED REGEX OR SELECTORS.
Analyzes page type using LLM.
async def analyze_page_type(
page, # Playwright page
llm, # LLM client
run_logger=None # Optional logger
) -> Dict[str, Any]:
"""
Returns:
{
"page_type": "product_listing" | "single_product" | "category" | "homepage" | "other",
"has_products": bool,
"confidence": float, # 0.0 - 1.0
"reasoning": str
}
"""
async def detect_price_format(
page,
llm,
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"format": "text" | "image" | "mixed",
"currency": "PLN" | "EUR" | "USD" | "other",
"sample": str # Example price
}
"""
Finds product containers using LLM.
async def find_product_containers(
page,
llm,
instruction: str = "",
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"found": bool,
"containers": [
{"selector": str, "count": int, "confidence": float}
],
"best": {"selector": str, "reasoning": str}
}
"""
async def analyze_container_content(
page,
llm,
container_selector: str,
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"is_product_container": bool,
"content_type": "products" | "navigation" | "articles" | "other",
"has_prices": bool,
"has_names": bool,
"reasoning": str
}
"""
Detects product fields using LLM.
async def detect_product_fields(
page,
llm,
container_selector: str,
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"found": bool,
"fields": {
"name": {"selector": str, "sample": str},
"price": {"selector": str, "sample": str},
"url": {"selector": str, "sample": str},
"image": {"selector": str, "sample": str}
},
"completeness": float # 0.0 - 1.0
}
"""
async def extract_field_value(
page,
llm,
container_element,
field_name: str,
run_logger=None
) -> Optional[str]:
"""
Extract single field value using LLM when selector fails.
"""
LLM generates patterns/selectors dynamically.
async def generate_price_pattern(page, llm, run_logger=None) -> Dict[str, Any]:
"""
LLM generates regex pattern for prices based on actual page content.
Returns:
{
"pattern": str, # Regex pattern
"currency": str,
"format": str,
"examples": [str]
}
"""
async def generate_product_link_pattern(page, llm, run_logger=None) -> Dict[str, Any]:
"""
LLM generates regex pattern for product links.
Returns:
{
"pattern": str, # Regex for URLs
"selector": str, # CSS selector
"examples": [str]
}
"""
async def generate_container_selector(
page,
llm,
content_type: str = "products",
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"selector": str,
"count": int,
"confidence": float,
"alternatives": [str]
}
"""
async def generate_extraction_strategy(
page,
llm,
instruction: str,
run_logger=None
) -> Dict[str, Any]:
"""
Complete extraction strategy from LLM.
Returns:
{
"container_selector": str,
"fields": {
"name": {"selector": str, "type": "text"},
"price": {"selector": str, "type": "text|image"},
"url": {"selector": str, "type": "attribute:href"}
},
"filters": {
"price_max": float | None,
"price_min": float | None
},
"confidence": float
}
"""
Main extraction orchestrator.
class LLMIterativeExtractor:
"""
Pipeline:
1. analyze_page_type() - Determine page type
2. find_product_containers() - Find containers
3. detect_product_fields() - Identify fields
4. extract_products() - Extract each product
5. filter_products() - Apply filters
"""
async def run(self, max_items: int = 50) -> Dict[str, Any]:
"""
Returns:
{
"products": [{name, price, url, image}, ...],
"count": int,
"reason": str,
"metadata": {...}
}
"""
async def llm_extract_products(
page,
llm,
instruction: str,
max_items: int = 50,
run_logger=None
) -> Dict[str, Any]:
"""Convenience function for extraction."""
Location: curllm_core/streamware/components/form/
Intelligent form filling with LLM verification.
class SmartFormOrchestrator:
"""
Features:
- Field type validation
- Per-field verification
- Post-submit evaluation
- Retry mechanisms
- CAPTCHA handling
"""
async def orchestrate(
self,
user_data: Dict[str, str],
instruction: str = ""
) -> Dict[str, Any]:
"""
Returns:
{
"success": bool,
"fields_filled": int,
"submission_result": {...},
"captcha_handled": bool,
"retries": int
}
"""
async def _detect_fields(page) -> List[Dict]:
"""Detect all form fields on page."""
async def _smart_map_fields(fields, data, instruction) -> List[Dict]:
"""LLM-based field mapping."""
async def _fill_with_verification(mapping) -> Dict:
"""Fill fields and verify each one."""
async def _detect_captcha(page) -> bool:
"""Detect CAPTCHA presence."""
async def _handle_captcha(page) -> bool:
"""Solve CAPTCHA using Vision LLM."""
async def _submit_with_evaluation(page) -> Dict:
"""Submit and evaluate success with LLM."""
Location: curllm_core/streamware/components/llm_decision.py
async def extract_fields_from_instruction_llm(
llm,
instruction: str,
available_fields: List[str] = None,
run_logger=None
) -> Dict[str, str]:
"""
Extract field=value pairs from instruction.
Example:
Input: "fill form with email=test@example.com, name=John"
Output: {"email": "test@example.com", "name": "John"}
"""
async def parse_selector_name_llm(
llm,
selector: str,
run_logger=None
) -> Optional[str]:
"""
Extract field name from CSS selector.
Example:
Input: "[name='email-input']"
Output: "email-input"
"""
async def plan_next_action_llm(
llm,
instruction: str,
page_context: Dict[str, Any],
history: List[Dict] = None,
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"type": "fill" | "click" | "wait" | "complete",
"selector": str,
"value": str,
"reason": str
}
"""
async def validate_action_llm(
llm,
action: Dict[str, Any],
before_state: Dict[str, Any],
after_state: Dict[str, Any],
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"success": bool,
"reason": str,
"confidence": float
}
"""
async def interpret_instruction_llm(
llm,
instruction: str,
run_logger=None
) -> Dict[str, Any]:
"""
Returns:
{
"intent": "fill_form" | "navigate" | "extract_data" | "click" | "search",
"url": str | None,
"form_data": Dict[str, str],
"expected_outcome": str,
"steps": [str]
}
"""
async def generate_selector_for_field_llm(
llm,
field_name: str,
form_fields: List[Dict],
run_logger=None
) -> Optional[str]:
"""Generate CSS selector for a semantic field name."""
Location: curllm_core/streamware/components/bql/
class BQLParser:
"""Parse BQL queries into executable AST."""
def parse(self, query: str) -> BQLNode:
"""Parse BQL string into node tree."""
class BQLExecutor:
"""Execute parsed BQL queries."""
async def execute(self, node: BQLNode, page) -> Any:
"""Execute BQL node on page."""
QUERY products FROM "https://shop.example.com/category" {
name: .product-name
price: .price-value
url: a.product-link@href
image: img.product-image@src
}
WHERE price < 500
ORDER BY price ASC
LIMIT 20
Location: curllm_core/streamware/components/vision/
class VisionAnalyzer:
"""OpenCV-based visual analysis."""
def detect_captcha_patterns(self, image_path: str) -> Dict:
"""Detect CAPTCHA patterns in image."""
def detect_distorted_text(self, image_path: str) -> bool:
"""Detect distorted/warped text using FFT."""
async def analyze_form_fields_vision(
llm,
screenshot_path: str,
run_logger=None
) -> Dict[str, Any]:
"""
Analyze form using Vision LLM.
Returns:
{
"visible_fields": [...],
"honeypots": [...],
"recommended_fill_order": [...],
"captcha_detected": bool
}
"""
async def solve_captcha_with_vision(
page,
llm,
screenshot_path: str
) -> Dict[str, Any]:
"""
Solve CAPTCHA using Vision LLM.
Returns:
{
"solved": bool,
"answer": str,
"confidence": float
}
"""
Location: curllm_core/streamware/components/browser/
async def apply_stealth_mode(page) -> None:
"""Apply anti-detection measures."""
async def randomize_viewport(page) -> None:
"""Randomize viewport size."""
async def inject_stealth_scripts(page) -> None:
"""Inject scripts to evade bot detection."""
async def navigate_with_retry(
page,
url: str,
max_retries: int = 3
) -> bool:
"""Navigate with automatic retry on failure."""
async def wait_for_load(page, timeout: int = 30000) -> bool:
"""Wait for page to fully load."""
Location: curllm_core/streamware/components/llm/
class SimpleOllama:
"""Minimal Ollama client for LLM calls."""
async def ainvoke(self, prompt: str) -> str:
"""Generate text response."""
async def ainvoke_with_image(
self,
prompt: str,
image_path: str
) -> str:
"""Generate response with image (Vision LLM)."""
def setup_llm() -> Any:
"""Initialize LLM client based on configuration."""
def get_llm() -> Any:
"""Get or create global LLM instance."""
import asyncio
from playwright.async_api import async_playwright
from curllm_core.streamware.components.extraction import llm_extract_products
from curllm_core.llm import SimpleOllama
async def extract_products():
llm = SimpleOllama()
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page()
await page.goto("https://example-shop.com/products")
result = await llm_extract_products(
page=page,
llm=llm,
instruction="Find all electronics under $500"
)
for product in result["products"]:
print(f"{product['name']}: ${product['price']}")
await browser.close()
asyncio.run(extract_products())
import asyncio
from playwright.async_api import async_playwright
from curllm_core.streamware.components.form import SmartFormOrchestrator
from curllm_core.llm import SimpleOllama
async def fill_contact_form():
llm = SimpleOllama()
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()
await page.goto("https://example.com/contact")
orchestrator = SmartFormOrchestrator(page, llm)
result = await orchestrator.orchestrate(
user_data={
"name": "John Doe",
"email": "john@example.com",
"message": "Hello, this is a test message."
},
instruction="Fill and submit the contact form"
)
print(f"Success: {result['success']}")
await browser.close()
asyncio.run(fill_contact_form())
Components are registered using decorators:
from curllm_core.streamware.registry import register
@register("my-component")
class MyComponent(Component):
"""Custom component."""
def process(self, data: Any) -> Dict[str, Any]:
# Implementation
pass
Usage:
from curllm_core.streamware.registry import create_component
component = create_component("my-component://action?param=value")
result = component.process(input_data)