class Arachnid::Agent

Defined in:

arachnid/agent/sanitizers.cr
arachnid/agent/filters.cr
arachnid/agent/events.cr
arachnid/agent/actions.cr
arachnid/agent/robots.cr
arachnid/agent.cr
arachnid/agent/queue.cr

Constructors

Class Method Summary

Instance Method Summary

Constructor Detail

def self.new(host : String? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil, do_not_track : Bool? = nil, default_headers : Hash(String, String)? = nil, host_header : String? = nil, host_headers : Hash(String | Regex, String)? = nil, user_agent : String? = nil, referer : String? = nil, fetch_delay : Int32 | Time::Span? = nil, queue : Hash(String, URI)? = nil, history : Set(URI)? = nil, limit : Int32? = nil, max_depth : Int32? = nil, robots : Bool? = nil, filter_options = nil) #

Creates a new Agent object.


[View source]
def self.new(**options, &block : Agent -> ) #

Create a new scoped Agent in a block.


[View source]

Class Method Detail

def self.host(url, **options, &block : Agent -> ) #

Creates a new Agent and spiders the given host.


[View source]
def self.site(url, **options, &block : Agent -> ) #

Creates a new Agent and spiders the web site located at the given URL.


[View source]
def self.start_at(url, **options, &block : Agent -> ) #

Creates a new Agent and begins spidering at the given URL.


[View source]

Instance Method Detail

def all_headers(&block : HTTP::Headers) #

Pass the headers from every response the agent receives to a given block.


[View source]
def authorized : AuthStore #

HTTP Authentication credentials.


[View source]
def authorized=(authorized : AuthStore) #

HTTP Authentication credentials.


[View source]
def clear #

Clears the history of the Agent.


[View source]
def continue!(&block) #

Continues spidering after the agent has been paused.


[View source]
def cookies : CookieJar #

Cached cookies.


[View source]
def cookies=(cookies : CookieJar) #

Cached cookies.


[View source]
def default_headers : Hash(String, String) #

HTTP Headers to use for every request.


[View source]
def default_headers=(default_headers : Hash(String, String)) #

HTTP Headers to use for every request.


[View source]
def dequeue #

Dequeues a URL that will later be visited.


[View source]
def enqueue(url, level = 0, force = false) #

Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.


[View source]
def every_atom(&block : Resource -> ) #

Pass every Atom feed that the agent visits to a given block.


[View source]
def every_atom_doc(&block : XML::Node -> ) #

Pass every Atom document that the agent parses to a given block.


[View source]
def every_bad_request_page(&block : Resource -> ) #

Pass every Bad Request resource that the agent visits to a given block.


[View source]
def every_content_type(content_type : String | Regex, &block : Resource -> ) #

Passes every resource with a matching content type to the given block.


[View source]
def every_css(&block : Resource -> ) #

Pass every CSS resource that the agent visits to a given block.


[View source]
def every_doc(&block : Document::HTML | XML::Node -> ) #

Pass every HTML or XML document that the agent parses to a given block.


[View source]
def every_failed_url(&block : URI -> ) #

Pass each URL that could not be requested to the given block.


[View source]
def every_forbidden_page(&block : Resource -> ) #

Pass every Forbidden resource that the agent visits to a given block.


[View source]
def every_html_doc(&block : Document::HTML | XML::Node -> ) #

Pass every HTML document that the agent parses to a given block.


[View source]
def every_html_page(&block : Resource -> ) #

Pass every HTML resource that the agent visits to a given block.


[View source]
def every_image(&block : Resource -> ) #

Passes every image resource to the given block.


[View source]
def every_internal_server_error_page(&block : Resource -> ) #

Pass every Internal Server Error resource that the agent visits to a given block.


[View source]
def every_javascript(&block : Resource -> ) #

Pass every JavaScript resource that the agent visits to a given block.


[View source]
def every_link(&block : URI, URI -> ) #

Passes every origin and destination URI of each link to a given block.


[View source]
def every_missing_page(&block : Resource -> ) #

Pass every Missing resource that the agent visits to a given block.


[View source]
def every_ms_word(&block : Resource -> ) #

Pass every MS Word resource that the agent visits to a given block.


[View source]
def every_ok_page(&block : Resource -> ) #

Pass every OK resource that the agent visits to a given block.


[View source]
def every_pdf(&block : Resource -> ) #

Pass every PDF resource that the agent visits to a given block.


[View source]
def every_redirect_page(&block : Resource -> ) #

Pass every Redirect resource that the agent visits to a given block.


[View source]
def every_resource(&block : Resource -> ) #

Pass every resource that the agent visits to a given block.


[View source]
def every_rss(&block : Resource -> ) #

Pass every RSS feed that the agent visits to a given block.


[View source]
def every_rss_doc(&block : XML::Node -> ) #

Pass every RSS document that the agent parses to a given block.


[View source]
def every_timedout_page(&block : Resource -> ) #

Pass every Timeout resource that the agent visits to a given block.


[View source]
def every_txt_page(&block : Resource -> ) #

Pass every Plain Text resource that the agent visits to a given block.


[View source]
def every_unauthorized_page(&block : Resource -> ) #

Pass every Unauthorized resource that the agent visits to a given block.


[View source]
def every_url(&block : URI -> ) #

Pass each URL from each resource visited to the given block.


[View source]
def every_url_like(pattern, &block : URI -> ) #

Pass every URL that the agent visits, and matches a given pattern, to a given block.


[View source]
def every_xml_doc(&block : XML::Node -> ) #

Pass every XML document that the agent parses to a given block.


[View source]
def every_xml_page(&block : Resource -> ) #

Pass every XML resource that the agent visits to a given block.


[View source]
def every_xsl_doc(&block : XML::Node -> ) #

Pass every XML Stylesheet (XSL) that the agent parses to a given block.


[View source]
def every_xsl_page(&block : Resource -> ) #

Pass every XML Stylesheet (XSL) resource that the agent visits to a given block.


[View source]
def every_zip(&block : Resource -> ) #

Pass every ZIP resource that the agent visits to a given block.


[View source]
def failed(url) #

Adds a given URL to the failures list.


[View source]
def failed?(url) #

Determines whether a given URL could not be visited.


[View source]
def failures : Set(URI) #

List of unreachable URIs.


[View source]
def failures=(new_failures) #

Sets the list of failed URLs.


[View source]
def fetch_delay : Time::Span | Int32 #

Delay in between fetching resources.


[View source]
def fetch_delay=(fetch_delay : Time::Span | Int32) #

Delay in between fetching resources.


[View source]
def get_resource(url, &block) #

Gets and creates a new Resource object from a given URL, yielding the newly created resource.


[View source]
def get_resource(url) #

Gets and creates a new Resource object from a given URL.


[View source]
def history : Set(URI) #

History containing visited URLs.


[View source]
def history=(new_history) #

Sets the history of URLs that were previously visited.


[View source]
def host : String? #

Set to limit to a single host.


[View source]
def host=(host : String?) #

Set to limit to a single host.


[View source]
def host_header : String? #

HTTP Host Header to use.


[View source]
def host_header=(host_header : String?) #

HTTP Host Header to use.


[View source]
def host_headers : Hash(String | Regex, String) #

HTTP Host Headers to use for specific hosts.


[View source]
def host_headers=(host_headers : Hash(String | Regex, String)) #

HTTP Host Headers to use for specific hosts.


[View source]
def ignore_exts #

Specifies the patterns that match URI path extensions to not visit.


[View source]
def ignore_exts_like(&block : String -> Bool) #

Adds a given pattern to the #ignore_exts.


[View source]
def ignore_exts_like(pattern) #

[View source]
def ignore_hosts #

Specifies the patterns that match host-names to not visit.


[View source]
def ignore_hosts_like(pattern) #

Adds a given pattern to the #ignore_hosts.


[View source]
def ignore_hosts_like(&block) #

[View source]
def ignore_links #

Specifies the patterns that match links to not visit.


[View source]
def ignore_links_like(&block : String -> Bool) #

[View source]
def ignore_links_like(pattern) #

Adds a given pattern to the #ignore_links.


[View source]
def ignore_ports #

Specifies the patterns that match ports to not visit.


[View source]
def ignore_ports_like(pattern) #

Adds a given pattern to the #ignore_ports.


[View source]
def ignore_ports_like(&block : Int32 -> Bool) #

[View source]
def ignore_urls #

Specifies the patterns that match URLs to not visit.


[View source]
def ignore_urls_like(&block : URI -> Bool) #

Adds a given pattern to the #ignore_urls.


[View source]
def ignore_urls_like(pattern) #

[View source]
def initialize_robots #

Initializes the robots filter.


[View source]
def levels : Hash(URI, Int32) #

The visited URLs and their depth within a site.


[View source]
def levels=(levels : Hash(URI, Int32)) #

The visited URLs and their depth within a site.


[View source]
def limit : Int32? #

Maximum number of resources to visit.


[View source]
def limit=(limit : Int32?) #

Maximum number of resources to visit.


[View source]
def limit_reached? #

Determines if the maximum limit has been reached.


[View source]
def max_depth : Int32? #

Maximum depth.


[View source]
def max_depth=(max_depth : Int32?) #

Maximum depth.


[View source]
def pause! #

Pauses the agent, causing spidering to temporarily stop.


[View source]
def pause=(state) #

Sets the pause state of the agent.


[View source]
def paused? #

Determines whether the agent is paused.


[View source]
def post_resource(url, post_data = "") #

Posts supplied form data and creates a new Resource from a given URL.


[View source]
def post_resource(url, post_data = "", &block) #

Posts supplied form data and creates a new Resource from a given URL, yielding the newly created resource.


[View source]
def prepare_request(url, &block) #

Normalizes the request path and grabs a session to handle resource get and post requests.


[View source]
def queue : Hash(String, URI) #

Queue of URLs to visit.


[View source]
def queue=(new_queue) #

Sets the queue of URLs to visit.


[View source]
def queued?(key) #

Determines whether the given URL has been queued for visiting.


[View source]
def referer : String? #

Referer to use.


[View source]
def referer=(referer : String?) #

Referer to use.


[View source]
def run #

Start spidering until the queue becomes empty or the agent is paused.


[View source]
def running? : Bool #

[View source]
def sanitize_url(url) #

Sanitizes a URL based on the agent's filtering options.


[View source]
def schemes : Array(String) #

List of acceptable URL schemes to follow.


[View source]
def schemes=(new_schemes) #

Sets the list of acceptable URL schemes to visit.


[View source]
def sessions : SessionCache #

The session cache.


[View source]
def sessions=(sessions : SessionCache) #

The session cache.


[View source]
def skip_link! #

Causes the agent to skip the link being enqueued.


[View source]
def skip_resource! #

Causes the agent to skip the resource being visited.


[View source]
def start_at(url, force = false) #

Start spidering at a given URL.


[View source]
def strip_fragments=(strip_fragments : Bool) #

Specifies whether the Agent will strip URI fragments.


[View source]
def strip_fragments? : Bool #

Specifies whether the Agent will strip URI fragments.


[View source]
def strip_query=(strip_query : Bool) #

Specifies whether the Agent will strip URI queries.


[View source]
def strip_query? : Bool #

Specifies whether the Agent will strip URI queries.


[View source]
def to_h #

Converts the agent into a hash.


[View source]
def urls_like(pattern, &block : URI -> ) #

[View source]
def user_agent : String #

User agent to use.


[View source]
def user_agent=(user_agent : String) #

User agent to use.


[View source]
def visit?(url) #

Determines if a given URL should be visited.


[View source]
def visit_exts #

Specifies the patterns that match the URI path extensions to visit.


[View source]
def visit_exts_like(pattern) #

[View source]
def visit_exts_like(&block : String -> Bool) #

Adds a given pattern to the #visit_exts.


[View source]
def visit_hosts #

Specifies the patterns that match host-names to visit.


[View source]
def visit_hosts_like(&block) #

[View source]
def visit_hosts_like(pattern) #

Adds a given pattern to the #visit_hosts.


[View source]
def visit_links #

Specifies the patterns that match the links to visit.


[View source]
def visit_links_like(pattern) #

Adds a given pattern to the #visit_links.


[View source]
def visit_links_like(&block : String -> Bool) #

[View source]
def visit_ports #

Specifies the patterns that match the ports to visit.


[View source]
def visit_ports_like(pattern) #

Adds a given pattern to the #visit_ports.


[View source]
def visit_ports_like(&block : Int32 -> Bool) #

[View source]
def visit_resource(url) #

Visits a given URL and enqueues the links recovered from the resource to be visited later.


[View source]
def visit_urls #

Specifies the patterns that match the URLs to visit.


[View source]
def visit_urls_like(&block : URI -> Bool) #

Adds a given pattern to the #visit_urls.


[View source]
def visit_urls_like(pattern) #

[View source]
def visited?(url) #

Determines whether a URL was visited or not.


[View source]
def visited_hosts #

Specifies the hosts which have been visited.


[View source]
def visited_links #

Specifies the links which have been visited.


[View source]