PyPI Package: olostep | Requirements: Python 3.11+
Installation
pip install olostep
Authentication
Get your API key from olostep.com/auth
from olostep import OlostepClient
client = OlostepClient(api_key="YOUR_API_KEY")
Quick Start
"""
The quickstart uses the async/await interface, which is the default and generally preferred.
* If you need a blocking interface, scroll to the end of this code block.
* If you want to see the full interfaces, scroll to the next section.
"""
from olostep import OlostepClient
# Provide the API key either by passing the 'api_key' parameter or
# by setting the OLOSTEP_API_KEY environment variable
client = OlostepClient(api_key="YOUR_REAL_KEY")
# MINIMAL SCRAPE EXAMPLE
scrape_result = await client.scrape("https://example.com")
# -> ScrapeResult(id='scrape_123', available=['html_content', 'markdown_content'])
# MINIMAL BATCH EXAMPLE
batch = await client.batch(["https://site1.com", "https://site2.com"])
# -> Batch(id='batch_123', urls=2)
# Waits for all the batch jobs to finish, then fetches the results in batches
async for item in batch.items():
    content = await item.retrieve(["html"])
    print(f"{item.url}: {len(content.html_content)} chars")
# MINIMAL CRAWL EXAMPLE
crawl = await client.crawl("https://example.com", max_pages=100)
# -> Crawl(id='crawl_123', urls=100)
async for page in crawl.pages():
    content = await page.retrieve(["html"])
    print(f"{page.url}: {len(content.html_content)} chars")
# SYNC (FACADE) CLIENT
# This client is a thin wrapper around the async client.
# The interface is identical; just don't use await.
# Prefer the async OlostepClient whenever you can.
from olostep import SyncOlostepClient
client = SyncOlostepClient(api_key="YOUR_REAL_KEY")
scrape_result = client.scrape("https://example.com")
# -> ScrapeResult(id='scrape_123', available=['html_content', 'markdown_content'])
Usage
The SDK provides a clean, Pythonic interface organized into logical namespaces. Each operation returns stateful objects with ergonomic methods for follow-up operations.
Scraping
from olostep import OlostepClient
from olostep import Country, FillInputAction, Format, LLMExtract, LinksOnPage, ScreenSize, Transformer, WaitAction
client = OlostepClient(api_key="YOUR_REAL_KEY")
# Minimal: Just scrape a URL
result = await client.scrape("https://example.com")
# ScrapeResult(id='scrape_123', available=['html_content', 'markdown_content'])
# Maximal: Full control over scraping behavior
result = await client.scrape(
"https://example.com",
wait_before_scraping=3000,
formats=[Format.HTML, Format.MARKDOWN],
remove_css_selectors=["script", ".popup"],
actions=[
WaitAction(milliseconds=1500),
FillInputAction(selector="searchbox", value="olostep")
],
country=Country.US,
transformer=Transformer("postlight"),
remove_images=True,
remove_class_names=["ad"],
parser="VALID_PARSER", # check website for valid parsers
llm_extract=LLMExtract(schema="YOUR_SCHEMA"),
links_on_page=LinksOnPage(
absolute_links=False,
query_to_order_links_by='cars',
include_links=["/events/**", "/offers/**"],
exclude_links=[".pdf"]
),
screen_size=ScreenSize(screen_width=1920, screen_height=1080),
metadata={"custom": "sidecar_data"}  # Not supported yet
)
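Either way, the returned ScrapeResult can fetch its own content afterwards, as in the quick start. A minimal follow-up sketch (assuming both formats were requested, as above):
# Retrieve the requested formats from the stateful result object
content = await result.retrieve(["html", "markdown"])
print(content.markdown_content[:200])  # first 200 chars of the markdown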
Batch Processing
from olostep import OlostepClient
from olostep import BatchItem, Country
client = OlostepClient(api_key="YOUR_REAL_KEY")
# Minimal: Process a list of URLs
batch = await client.batch(["https://site1.com", "https://site2.com"])
# Batch(id='batch_123', urls=2)
# Maximal: Advanced batch with custom IDs and options
batch = await client.batch(
[
BatchItem(url="https://www.google.com/search?q=olostep"),
BatchItem(url="https://www.google.com/search?q=olostep+api", custom_id="news_2")
],
country=Country.US,
parser_id="@olostep/google-search"
)
# Optional: you can check on the progress of your batch at any time with:
info = await batch.info()
# -> BatchInfo(id='batch_123', status='in_progress', completed=1/2, age=2h ago)
# Also optional: Wait for completion.
# Pass in `check_every_n_secs=` to change interval, default 10
await batch.wait_till_done()
# Note: batch.items() automatically waits for the batch to complete before yielding items (disable by passing wait_for_completion=False)
async for item in batch.items(batch_size=10):
    content = await item.retrieve(["html", "json"])  # json comes from the parser
    print(f"{item.custom_id}: {len(content.html_content)} chars")
# Alternative: Direct API access (stateless)
async for item in client.batch.items(batch_id='a_batch_id', batch_size=10):
    content = await item.retrieve(["html", "json"])
    print(f"{item.custom_id}: {len(content.html_content)} chars")
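If sequential iteration is too slow for your use case, the per-item retrieval can be fanned out. A sketch using asyncio.gather; this assumes item.retrieve is safe to call concurrently, which this document does not guarantee:
import asyncio

async def fetch_item(item):
    content = await item.retrieve(["html"])
    return item.custom_id, len(content.html_content)

# Collect the items first, then retrieve their content concurrently
items = [item async for item in batch.items(batch_size=10)]
results = await asyncio.gather(*(fetch_item(item) for item in items))
for custom_id, size in results:
    print(f"{custom_id}: {size} chars")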
Web Crawling
# Minimal: Crawl a site with default settings
crawl = await client.crawl("https://example.com", max_pages=100)
# Crawl(id='crawl_123', urls=100)
# Maximal: Advanced crawling with filters and limits
crawl = await client.crawl(
"https://example.com",
max_pages=1000,
max_depth=3,
include_urls=["/articles/**", "/news/**"],
exclude_urls=["/ads/**", "/tracking/**"],
include_external=False,
include_subdomain=True,
search_query="hot shingles",
top_n=50
)
# Optional: you can check on the progress of your crawl at any time with:
info = await crawl.info() # CrawlInfo(id='crawl_123', status='in_progress', pages_count=42, age=15m ago)
# Also optional: Wait for completion.
# Pass in `check_every_n_secs=` to change interval, default 10
await crawl.wait_till_done()
# Note: crawl.pages() automatically waits for the crawl to complete before yielding pages (disable by passing wait_for_completion=False)
async for page in crawl.pages():
    content = await page.retrieve(["html"])
    print(f"{page.url}: {len(content.html_content)} chars")
# Alternative: Direct API access (stateless)
async for page in client.crawl.pages(crawl_id='a_crawl_id'):
    content = await page.retrieve(["html"])
    print(f"{page.url}: {len(content.html_content)} chars")
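A common follow-up is persisting each crawled page to disk. A small sketch; the filename scheme is illustrative, not part of the SDK:
from pathlib import Path
from urllib.parse import urlparse

out_dir = Path("crawl_output")
out_dir.mkdir(exist_ok=True)
async for page in crawl.pages():
    content = await page.retrieve(["html"])
    # Derive a flat filename from the URL path (purely illustrative)
    name = urlparse(page.url).path.strip("/").replace("/", "_") or "index"
    (out_dir / f"{name}.html").write_text(content.html_content)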
Site Mapping
# Minimal: Extract all links from a site
sitemap = await client.sitemap("https://example.com")
# Sitemap(id='map_123', urls_count=150, has_more=True)
# Maximal: Advanced link extraction with filters
sitemap = await client.sitemap(
"https://example.com",
search_query="documentation",
top_n=500,
include_subdomain=True,
include_urls=["/docs/**", "/api/**"],
exclude_urls=["/admin/**", "/private/**"]
)
# Seamless iteration over all URLs (auto-pagination)
all_urls = []
async for url in sitemap.urls():  # async generator
    print(f"Found URL: {url}")
    all_urls.append(url)
# Note: This can yield tens of thousands of URLs. If possible, avoid
# building a list and consume the generator directly instead.
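Sitemap output composes naturally with the batch endpoint: discover URLs first, then scrape them in bulk. A sketch assuming the generator yields plain URL strings, as the loop above suggests:
# Collect a bounded slice of the sitemap and feed it into a batch
urls = []
async for url in sitemap.urls():
    urls.append(url)
    if len(urls) >= 100:  # cap the batch size for this example
        break
batch = await client.batch(urls)
await batch.wait_till_done()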
Data Retrieval
# Notes:
# * You should rarely need this endpoint directly: the other endpoints return stateful objects that can retrieve their own content.
# * Not all formats are available for every result.
# Minimal: Get content by retrieve ID
result = await client.retrieve("ret_123")
# ScrapeResult(id='ret_123', available=[...])
# Maximal: Get multiple formats
result = await client.retrieve("ret_123", ["html", "markdown", "text", "json"])
# ScrapeResult(id='ret_123', available=['html_content', 'markdown_content', 'text_content', 'json_content'])
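Since not every format is available for every result, it is safer to check the `available` attribute shown in the reprs above before touching a `*_content` field. A sketch assuming `available` lists attribute names, as those reprs suggest:
result = await client.retrieve("ret_123", ["html", "markdown"])
if "markdown_content" in result.available:
    print(result.markdown_content)
else:
    print(f"Markdown not available; got: {result.available}")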
Advanced Features
Method Shorthands
# These are equivalent:
await client.scrape("https://example.com") # shorthand
await client.scrape.create("https://example.com") # explicit method
await client.batch(["url1", "url2"]) # shorthand
await client.batch.start(["url1", "url2"]) # explicit method
await client.crawl("https://example.com") # shorthand
await client.crawl.start("https://example.com") # explicit method
await client.sitemap("https://example.com") # shorthand
await client.sitemap.create("https://example.com") # explicit method
await client.retrieve("ret_123") # shorthand
await client.retrieve.get("ret_123") # explicit method
Smart Input Coercion
The SDK intelligently handles various input formats for maximum convenience:
# Formats: string, list, or enum
await client.scrape("https://example.com", formats="html")
await client.scrape("https://example.com", formats=["html", "markdown"])
# Countries: case-insensitive strings or enums
await client.scrape("https://example.com", country="us")
await client.scrape("https://example.com", country=Country.US)
# Lists: single values or lists
await client.batch("https://example.com") # Single URL
await client.batch(["https://a.com", "https://b.com"]) # Multiple URLs
Error Handling
Exception Hierarchy
The SDK handles error detection for you and provides a comprehensive exception hierarchy:
* Olostep_BaseError -------------------------------------- <- Catch-all base class for all errors
  x Olostep_APIConnectionError --------------------------- <- No connection to the API
  x OlostepServerError_BaseError ------------------------- <- Server-issued errors (still detected client-side)
    + OlostepServerError_TemporaryIssue
      - OlostepServerError_NetworkBusy
      - OlostepServerError_InternalNetworkIssue
    + OlostepServerError_RequestUnprocessable
      - OlostepServerError_ParserNotFound
      - OlostepServerError_OutOfResources
    + OlostepServerError_BlacklistedDomain
    + OlostepServerError_FeatureApprovalRequired
    + OlostepServerError_AuthFailed
    + OlostepServerError_CreditsExhausted
    + OlostepServerError_InvalidEndpointCalled
    + OlostepServerError_ResourceNotFound
    + OlostepServerError_NoResultInResponse
    + OlostepServerError_UnknownIssue
  x OlostepClientError_BaseError ------------------------- <- Client-issued errors
    + OlostepClientError_RequestValidationFailed
    + OlostepClientError_ResponseValidationFailed
    + OlostepClientError_NoAPIKey
    + OlostepClientError_AsyncContext
    + OlostepClientError_BetaFeatureAccessRequired
    + OlostepClientError_Timeout
Handling Errors
from olostep import OlostepClient
from olostep.errors import (
Olostep_BaseError,
Olostep_APIConnectionError,
OlostepServerError_AuthFailed,
OlostepClientError_Timeout,
)
client = OlostepClient()  # picks up the API key from OLOSTEP_API_KEY
try:
    result = await client.scrape("https://example.com")
    content = await result.retrieve(["html"])
    print(content.html_content)
except Olostep_APIConnectionError:
    print("Network error - check your connection")
except OlostepServerError_AuthFailed:
    print("Authentication failed - check your API key")
except OlostepClientError_Timeout:
    print("Request timed out - try again")
except Olostep_BaseError as e:
    print(f"Olostep error: {e}")
The SDK automatically retries failed requests with exponential backoff for transient errors.
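If you want application-level retries on top of the SDK's built-in ones, you can key them off the temporary-issue branch of the hierarchy above. A sketch; the attempt count and backoff are arbitrary choices, not SDK behavior:
import asyncio
from olostep.errors import OlostepServerError_TemporaryIssue

async def scrape_with_retries(client, url, attempts=3):
    for attempt in range(1, attempts + 1):
        try:
            return await client.scrape(url)
        except OlostepServerError_TemporaryIssue:
            if attempt == attempts:
                raise
            await asyncio.sleep(2 ** attempt)  # simple exponential backoff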
Logging
Enable logging to debug issues:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("olostep")
logger.setLevel(logging.INFO) # Use DEBUG for verbose output
Available levels: INFO (recommended), DEBUG (verbose), WARNING, ERROR.
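To capture verbose output for later inspection without flooding the console, standard library logging handlers work as usual. A sketch using a file handler:
import logging

logging.basicConfig(level=logging.INFO)  # console stays at INFO
logger = logging.getLogger("olostep")
logger.setLevel(logging.DEBUG)  # verbose output

# Send DEBUG logs to a file for later inspection
file_handler = logging.FileHandler("olostep_debug.log")
file_handler.setLevel(logging.DEBUG)
logger.addHandler(file_handler)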
Configuration
Environment Variables
| Variable | Description | Default |
|---|---|---|
| OLOSTEP_API_KEY | Your API key | Required |
| OLOSTEP_BASE_API_URL | API base URL | https://api.olostep.com/v1 |
| OLOSTEP_API_TIMEOUT | Request timeout (seconds) | 150 |
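These variables can be set in the shell or in-process before the client is constructed. A sketch using os.environ; the values are placeholders:
import os

os.environ["OLOSTEP_API_KEY"] = "YOUR_REAL_KEY"
os.environ["OLOSTEP_API_TIMEOUT"] = "60"  # seconds, overrides the 150s default

from olostep import OlostepClient
client = OlostepClient()  # reads OLOSTEP_API_KEY from the environment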