class Arachnid::Agent

Superclass hierarchy:
- Arachnid::Agent
- Reference
- Object
Defined in:
arachnid/agent/sanitizers.cr
arachnid/agent/filters.cr
arachnid/agent/events.cr
arachnid/agent/actions.cr
arachnid/agent/robots.cr
arachnid/agent.cr
arachnid/agent/queue.cr
Constructors
- .new(host : String? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil, do_not_track : Bool? = nil, default_headers : Hash(String, String)? = nil, host_header : String? = nil, host_headers : Hash(String | Regex, String)? = nil, user_agent : String? = nil, referer : String? = nil, fetch_delay : Int32 | Time::Span? = nil, queue : Hash(String, URI)? = nil, history : Set(URI)? = nil, limit : Int32? = nil, max_depth : Int32? = nil, robots : Bool? = nil, filter_options = nil)
  Creates a new Agent object.
- .new(**options, &block : Agent -> )
  Creates a new scoped Agent in a block.
Class Method Summary
- .host(url, **options, &block : Agent -> )
  Creates a new Agent and spiders the given host.
- .site(url, **options, &block : Agent -> )
  Creates a new Agent and spiders the web site located at the given URL.
- .start_at(url, **options, &block : Agent -> )
  Creates a new Agent and begins spidering at the given URL.
Instance Method Summary
- #all_headers(&block : HTTP::Headers)
  Pass the headers from every response the agent receives to a given block.
- #authorized : AuthStore
  HTTP Authentication credentials.
- #authorized=(authorized : AuthStore)
  HTTP Authentication credentials.
- #clear
  Clears the history of the Agent.
- #continue!(&block)
  Continue spidering.
- #cookies : CookieJar
  Cached cookies.
- #cookies=(cookies : CookieJar)
  Cached cookies.
- #default_headers : Hash(String, String)
  HTTP Headers to use for every request.
- #default_headers=(default_headers : Hash(String, String))
  HTTP Headers to use for every request.
- #dequeue
  Dequeues a URL that will later be visited.
- #enqueue(url, level = 0, force = false)
  Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.
- #every_atom(&block : Resource -> )
  Pass every Atom feed that the agent visits to a given block.
- #every_atom_doc(&block : XML::Node -> )
  Pass every Atom document that the agent parses to a given block.
- #every_bad_request_page(&block : Resource -> )
  Pass every Bad Request resource that the agent visits to a given block.
- #every_content_type(content_type : String | Regex, &block : Resource -> )
  Passes every resource with a matching content type to the given block.
- #every_css(&block : Resource -> )
  Pass every CSS resource that the agent visits to a given block.
- #every_doc(&block : Document::HTML | XML::Node -> )
  Pass every HTML or XML document that the agent parses to a given block.
- #every_failed_url(&block : URI -> )
  Pass each URL that could not be requested to the given block.
- #every_forbidden_page(&block : Resource -> )
  Pass every Forbidden resource that the agent visits to a given block.
- #every_html_doc(&block : Document::HTML | XML::Node -> )
  Pass every HTML document that the agent parses to a given block.
- #every_html_page(&block : Resource -> )
  Pass every HTML resource that the agent visits to a given block.
- #every_image(&block : Resource -> )
  Passes every image resource to the given block.
- #every_internal_server_error_page(&block : Resource -> )
  Pass every Internal Server Error resource that the agent visits to a given block.
- #every_javascript(&block : Resource -> )
  Pass every JavaScript resource that the agent visits to a given block.
- #every_link(&block : URI, URI -> )
  Passes every origin and destination URI of each link to a given block.
- #every_missing_page(&block : Resource -> )
  Pass every Missing resource that the agent visits to a given block.
- #every_ms_word(&block : Resource -> )
  Pass every MS Word resource that the agent visits to a given block.
- #every_ok_page(&block : Resource -> )
  Pass every OK resource that the agent visits to a given block.
- #every_pdf(&block : Resource -> )
  Pass every PDF resource that the agent visits to a given block.
- #every_redirect_page(&block : Resource -> )
  Pass every Redirect resource that the agent visits to a given block.
- #every_resource(&block : Resource -> )
  Pass every resource that the agent visits to a given block.
- #every_rss(&block : Resource -> )
  Pass every RSS feed that the agent visits to a given block.
- #every_rss_doc(&block : XML::Node -> )
  Pass every RSS document that the agent parses to a given block.
- #every_timedout_page(&block : Resource -> )
  Pass every Timeout resource that the agent visits to a given block.
- #every_txt_page(&block : Resource -> )
  Pass every Plain Text resource that the agent visits to a given block.
- #every_unauthorized_page(&block : Resource -> )
  Pass every Unauthorized resource that the agent visits to a given block.
- #every_url(&block : URI -> )
  Pass each URL from each resource visited to the given block.
- #every_url_like(pattern, &block : URI -> )
  Pass every URL that the agent visits and that matches a given pattern to a given block.
- #every_xml_doc(&block : XML::Node -> )
  Pass every XML document that the agent parses to a given block.
- #every_xml_page(&block : Resource -> )
  Pass every XML resource that the agent visits to a given block.
- #every_xsl_doc(&block : XML::Node -> )
  Pass every XML Stylesheet (XSL) that the agent parses to a given block.
- #every_xsl_page(&block : Resource -> )
  Pass every XML Stylesheet (XSL) resource that the agent visits to a given block.
- #every_zip(&block : Resource -> )
  Pass every ZIP resource that the agent visits to a given block.
- #failed(url)
  Adds a given URL to the failures list.
- #failed?(url)
  Determines whether a given URL could not be visited.
- #failures : Set(URI)
  List of unreachable URIs.
- #failures=(new_failures)
  Sets the list of failed URLs.
- #fetch_delay : Time::Span | Int32
  Delay in between fetching resources.
- #fetch_delay=(fetch_delay : Time::Span | Int32)
  Delay in between fetching resources.
- #get_resource(url, &block)
  Gets and creates a new Resource object from a given URL, yielding the newly created resource.
- #get_resource(url)
  Gets and creates a new Resource object from a given URL.
- #history : Set(URI)
  History containing visited URLs.
- #history=(new_history)
  Sets the history of URLs that were previously visited.
- #host : String?
  Set to limit spidering to a single host.
- #host=(host : String?)
  Set to limit spidering to a single host.
- #host_header : String?
  HTTP Host Header to use.
- #host_header=(host_header : String?)
  HTTP Host Header to use.
- #host_headers : Hash(String | Regex, String)
  HTTP Host Headers to use for specific hosts.
- #host_headers=(host_headers : Hash(String | Regex, String))
  HTTP Host Headers to use for specific hosts.
- #ignore_exts
  Specifies the patterns that match URI path extensions to not visit.
- #ignore_exts_like(&block : String -> Bool)
  Adds a given pattern to #ignore_exts.
- #ignore_exts_like(pattern)
- #ignore_hosts
  Specifies the patterns that match host-names to not visit.
- #ignore_hosts_like(pattern)
  Adds a given pattern to #ignore_hosts.
- #ignore_hosts_like(&block)
- #ignore_links
  Specifies the patterns that match links to not visit.
- #ignore_links_like(&block : String -> Bool)
- #ignore_links_like(pattern)
  Adds a given pattern to #ignore_links.
- #ignore_ports
  Specifies the patterns that match ports to not visit.
- #ignore_ports_like(pattern)
  Adds a given pattern to #ignore_ports.
- #ignore_ports_like(&block : Int32 -> Bool)
- #ignore_urls
  Specifies the patterns that match URLs to not visit.
- #ignore_urls_like(&block : URI -> Bool)
  Adds a given pattern to #ignore_urls.
- #ignore_urls_like(pattern)
- #initialize_robots
  Initializes the robots filter.
- #levels : Hash(URI, Int32)
  The visited URLs and their depth within a site.
- #levels=(levels : Hash(URI, Int32))
  The visited URLs and their depth within a site.
- #limit : Int32?
  Maximum number of resources to visit.
- #limit=(limit : Int32?)
  Maximum number of resources to visit.
- #limit_reached?
  Determines if the maximum limit has been reached.
- #max_depth : Int32?
  Maximum depth.
- #max_depth=(max_depth : Int32?)
  Maximum depth.
- #pause!
  Pauses the agent, causing spidering to temporarily stop.
- #pause=(state)
  Sets the pause state of the agent.
- #paused?
  Determines whether the agent is paused.
- #post_resource(url, post_data = "")
  Posts supplied form data and creates a new Resource from a given URL.
- #post_resource(url, post_data = "", &block)
  Posts supplied form data and creates a new Resource from a given URL, yielding the newly created resource.
- #prepare_request(url, &block)
  Normalizes the request path and grabs a session to handle resource GET and POST requests.
- #queue : Hash(String, URI)
  Queue of URLs to visit.
- #queue=(new_queue)
  Sets the queue of URLs to visit.
- #queued?(key)
  Determines whether the given URL has been queued for visiting.
- #referer : String?
  Referer to use.
- #referer=(referer : String?)
  Referer to use.
- #run
  Start spidering until the queue becomes empty or the agent is paused.
- #running? : Bool
- #sanitize_url(url)
  Sanitizes a URL based on filtering options.
- #schemes : Array(String)
  List of acceptable URL schemes to follow.
- #schemes=(new_schemes)
  Sets the list of acceptable URL schemes to visit.
- #sessions : SessionCache
  The session cache.
- #sessions=(sessions : SessionCache)
  The session cache.
- #skip_link!
  Causes the agent to skip the link being enqueued.
- #skip_resource!
  Causes the agent to skip the resource being visited.
- #start_at(url, force = false)
  Start spidering at a given URL.
- #strip_fragments=(strip_fragments : Bool)
  Specifies whether the Agent will strip URI fragments.
- #strip_fragments? : Bool
  Specifies whether the Agent will strip URI fragments.
- #strip_query=(strip_query : Bool)
  Specifies whether the Agent will strip URI queries.
- #strip_query? : Bool
  Specifies whether the Agent will strip URI queries.
- #to_h
  Converts the agent into a hash.
- #urls_like(pattern, &block : URI -> )
  See #every_url_like.
- #user_agent : String
  User agent to use.
- #user_agent=(user_agent : String)
  User agent to use.
- #visit?(url)
  Determines if a given URL should be visited.
- #visit_exts
  Specifies the patterns that match the URI path extensions to visit.
- #visit_exts_like(pattern)
- #visit_exts_like(&block : String -> Bool)
  Adds a given pattern to #visit_exts.
- #visit_hosts
  Specifies the patterns that match host-names to visit.
- #visit_hosts_like(&block)
- #visit_hosts_like(pattern)
  Adds a given pattern to #visit_hosts.
- #visit_links
  Specifies the patterns that match the links to visit.
- #visit_links_like(pattern)
  Adds a given pattern to #visit_links.
- #visit_links_like(&block : String -> Bool)
- #visit_ports
  Specifies the patterns that match the ports to visit.
- #visit_ports_like(pattern)
  Adds a given pattern to #visit_ports.
- #visit_ports_like(&block : Int32 -> Bool)
- #visit_resource(url)
  Visits a given URL and enqueues the links recovered from the resource to be visited later.
- #visit_urls
  Specifies the patterns that match the URLs to visit.
- #visit_urls_like(&block : URI -> Bool)
  Adds a given pattern to #visit_urls.
- #visit_urls_like(pattern)
- #visited?(url)
  Determines whether a URL was visited or not.
- #visited_hosts
  Specifies the hosts which have been visited.
- #visited_links
  Specifies the links which have been visited.
Constructor Detail
.new(host : String? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil, do_not_track : Bool? = nil, default_headers : Hash(String, String)? = nil, host_header : String? = nil, host_headers : Hash(String | Regex, String)? = nil, user_agent : String? = nil, referer : String? = nil, fetch_delay : Int32 | Time::Span? = nil, queue : Hash(String, URI)? = nil, history : Set(URI)? = nil, limit : Int32? = nil, max_depth : Int32? = nil, robots : Bool? = nil, filter_options = nil)
Creates a new Agent object.
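For illustration, a minimal sketch of building an Agent directly with a few of these options and then starting a crawl. The URL and option values are examples only, and registering callbacks before calling #start_at is an assumed, not documented, order of use.

```crystal
require "arachnid"

# Example values only; none of these are defaults.
agent = Arachnid::Agent.new(
  host: "example.com",         # limit spidering to this host
  user_agent: "MyCrawler/1.0",
  limit: 100,                  # visit at most 100 resources
  max_depth: 3,                # follow links at most 3 levels deep
  fetch_delay: 1               # pause between requests
)

agent.every_url { |url| puts url }
agent.start_at("https://example.com/")
```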
Class Method Detail
.host(url, **options, &block : Agent -> )
Creates a new Agent and spiders the given host.
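A sketch of the block form based only on the signature above. Whether the url argument should be a bare host name or a full URL is an assumption, and the block is assumed to configure the agent before the crawl runs.

```crystal
require "arachnid"

Arachnid::Agent.host("https://example.com") do |agent|
  agent.every_url do |url|
    puts url
  end
end
```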
.site(url, **options, &block : Agent -> )
Creates a new Agent and spiders the web site located at the given URL.
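A sketch assuming that **options is forwarded to .new, so constructor options such as limit and max_depth can be supplied here. Resource#url is an assumed accessor.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/", limit: 50, max_depth: 2) do |agent|
  agent.every_html_page do |resource|
    puts resource.url # assumed accessor on Resource
  end
end
```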
.start_at(url, **options, &block : Agent -> )
Creates a new Agent and begins spidering at the given URL.
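A sketch that starts from an arbitrary URL and narrows the crawl with a host filter, assuming that .start_at itself does not restrict hosts and that #visit_hosts_like accepts a Regex.

```crystal
require "arachnid"

Arachnid::Agent.start_at("https://example.com/docs/") do |agent|
  agent.visit_hosts_like(/example\.com/)
  agent.every_url { |url| puts url }
end
```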
Instance Method Detail
#all_headers(&block : HTTP::Headers)
Pass the headers from every response the agent receives to a given block.
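For example, logging one header from every response. HTTP::Headers#[]? is the standard-library accessor; the rest follows the signature above.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/") do |agent|
  agent.all_headers do |headers|
    puts headers["Content-Type"]?
  end
end
```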
#default_headers : Hash(String, String)
HTTP Headers to use for every request.
#enqueue(url, level = 0, force = false)
Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.
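A sketch of hand-seeding the queue before running. Passing URL strings to #enqueue and draining a hand-seeded queue with #run are assumptions based on the descriptions here.

```crystal
require "arachnid"

agent = Arachnid::Agent.new(host: "example.com")

# Each URL is only queued if it passes the agent's visit rules;
# force = true would bypass those rules.
agent.enqueue("https://example.com/sitemap.xml")
agent.enqueue("https://example.com/blog/", 1)

agent.run
```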
#every_atom(&block : Resource -> )
Pass every Atom feed that the agent visits to a given block.
#every_atom_doc(&block : XML::Node -> )
Pass every Atom document that the agent parses to a given block.
#every_bad_request_page(&block : Resource -> )
Pass every Bad Request resource that the agent visits to a given block.
#every_content_type(content_type : String | Regex, &block : Resource -> )
Passes every resource with a matching content type to the given block.
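For instance, matching a content type by Regex as the signature allows. Resource#url is an assumed accessor.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/") do |agent|
  # Matches application/json, application/ld+json, etc.
  agent.every_content_type(/json/) do |resource|
    puts resource.url # assumed accessor on Resource
  end
end
```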
#every_css(&block : Resource -> )
Pass every CSS resource that the agent visits to a given block.
#every_doc(&block : Document::HTML | XML::Node -> )
Pass every HTML or XML document that the agent parses to a given block.
#every_failed_url(&block : URI -> )
Pass each URL that could not be requested to the given block.
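A sketch that reports failures as they happen and then reads the #failures set after the crawl. Registering the callback before #start_at is an assumed order of use.

```crystal
require "arachnid"

agent = Arachnid::Agent.new(host: "example.com")

agent.every_failed_url do |url|
  STDERR.puts "could not fetch #{url}"
end

agent.start_at("https://example.com/")
puts "#{agent.failures.size} URLs failed"
```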
#every_forbidden_page(&block : Resource -> )
Pass every Forbidden resource that the agent visits to a given block.
#every_html_doc(&block : Document::HTML | XML::Node -> )
Pass every HTML document that the agent parses to a given block.
#every_html_page(&block : Resource -> )
Pass every HTML resource that the agent visits to a given block.
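A sketch showing the resource callback next to the parsed-document callback. Resource#url is an assumed accessor; the yielded document types come from the signatures above.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/") do |agent|
  # Raw HTML responses.
  agent.every_html_page do |resource|
    puts "html: #{resource.url}" # assumed accessor on Resource
  end

  # Parsed documents (Document::HTML or XML::Node).
  agent.every_html_doc do |doc|
    puts doc.class
  end
end
```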
#every_image(&block : Resource -> )
Passes every image resource to the given block.
#every_internal_server_error_page(&block : Resource -> )
Pass every Internal Server Error resource that the agent visits to a given block.
#every_javascript(&block : Resource -> )
Pass every JavaScript resource that the agent visits to a given block.
#every_link(&block : URI, URI -> )
Passes every origin and destination URI of each link to a given block.
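For example, printing the site's link graph as an edge list; both block arguments are URIs per the signature above.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/") do |agent|
  agent.every_link do |origin, dest|
    puts "#{origin} -> #{dest}"
  end
end
```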
#every_missing_page(&block : Resource -> )
Pass every Missing resource that the agent visits to a given block.
#every_ms_word(&block : Resource -> )
Pass every MS Word resource that the agent visits to a given block.
#every_ok_page(&block : Resource -> )
Pass every OK resource that the agent visits to a given block.
#every_pdf(&block : Resource -> )
Pass every PDF resource that the agent visits to a given block.
#every_redirect_page(&block : Resource -> )
Pass every Redirect resource that the agent visits to a given block.
#every_resource(&block : Resource -> )
Pass every resource that the agent visits to a given block.
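A sketch that counts every visited resource and compares the count with #history afterwards. Registering callbacks before #start_at is an assumed order of use.

```crystal
require "arachnid"

agent = Arachnid::Agent.new(host: "example.com", limit: 25)

visited = 0
agent.every_resource { |_resource| visited += 1 }

agent.start_at("https://example.com/")
puts "saw #{visited} resources (history: #{agent.history.size})"
```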
#every_rss(&block : Resource -> )
Pass every RSS feed that the agent visits to a given block.
#every_rss_doc(&block : XML::Node -> )
Pass every RSS document that the agent parses to a given block.
#every_timedout_page(&block : Resource -> )
Pass every Timeout resource that the agent visits to a given block.
#every_txt_page(&block : Resource -> )
Pass every Plain Text resource that the agent visits to a given block.
#every_unauthorized_page(&block : Resource -> )
Pass every Unauthorized resource that the agent visits to a given block.
#every_url(&block : URI -> )
Pass each URL from each resource visited to the given block.
#every_url_like(pattern, &block : URI -> )
Pass every URL that the agent visits and that matches a given pattern to a given block.
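A sketch of the URL callbacks; passing a Regex as the pattern is an assumption.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/") do |agent|
  # Every URL found on visited resources.
  agent.every_url { |url| puts url }

  # Only URLs matching a pattern.
  agent.every_url_like(/\.pdf$/) do |url|
    puts "PDF link: #{url}"
  end
end
```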
#every_xml_doc(&block : XML::Node -> )
Pass every XML document that the agent parses to a given block.
#every_xml_page(&block : Resource -> )
Pass every XML resource that the agent visits to a given block.
#every_xsl_doc(&block : XML::Node -> )
Pass every XML Stylesheet (XSL) that the agent parses to a given block.
#every_xsl_page(&block : Resource -> )
Pass every XML Stylesheet (XSL) resource that the agent visits to a given block.
#every_zip(&block : Resource -> )
Pass every ZIP resource that the agent visits to a given block.
#get_resource(url, &block)
Gets and creates a new Resource object from a given URL, yielding the newly created resource.
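For example, fetching a single resource outside of a full crawl; whether #get_resource accepts a URL string or requires a URI is an assumption.

```crystal
require "arachnid"

agent = Arachnid::Agent.new

agent.get_resource("https://example.com/robots.txt") do |resource|
  pp resource # dump the yielded Resource
end
```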
#host_headers : Hash(String | Regex, String)
HTTP Host Headers to use for specific hosts.
#ignore_urls_like(&block : URI -> Bool)
Adds a given pattern to #ignore_urls.
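A sketch of both overloads. Regex support in the pattern form, and the block form ignoring a URI when it returns true, are assumptions based on the signatures above.

```crystal
require "arachnid"

Arachnid::Agent.site("https://example.com/") do |agent|
  # Pattern form: skip anything under /admin.
  agent.ignore_urls_like(/\/admin\//)

  # Block form: assumed to ignore the URI when the block returns true.
  agent.ignore_urls_like { |uri| uri.query.to_s.includes?("logout") }

  agent.every_url { |url| puts url }
end
```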
#levels : Hash(URI, Int32)
The visited URLs and their depth within a site.
#post_resource(url, post_data = "")
Posts supplied form data and creates a new Resource from a given URL.
#post_resource(url, post_data = "", &block)
Posts supplied form data and creates a new Resource from a given URL, yielding the newly created resource.
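A sketch of the yielding overload; treating post_data as a form-encoded string is an assumption.

```crystal
require "arachnid"

agent = Arachnid::Agent.new

agent.post_resource("https://example.com/search", "q=crystal") do |resource|
  pp resource # dump the yielded Resource
end
```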
#prepare_request(url, &block)
Normalizes the request path and grabs a session to handle resource GET and POST requests.
#visit_resource(url)
Visits a given URL and enqueues the links recovered from the resource to be visited later.
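#prepare_request and #visit_resource are mostly exercised indirectly; a sketch of the usual external flow that drives them follows, with the internal sequencing stated as an assumption.

```crystal
require "arachnid"

agent = Arachnid::Agent.new(host: "example.com", limit: 10)
agent.every_url { |url| puts url }

# start_at seeds the queue; the run loop then calls #visit_resource
# for each queued URL (assumed internal flow).
agent.start_at("https://example.com/")

puts agent.visited?("https://example.com/") # String argument assumed
```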