% - -- - - -- - - -- - - -- - - -- - - -- - - -- - -
% ziggy3.erl - an experimental web crawler, version three
% - -- - - -- - - -- - - -- - - -- - - -- - - -- - -
-module(ziggy3).
-author('Alan G. Labouseur').
-define(else, true). % -- This is to make the if statements (somewhat more) readable.
-define(POLITE_PAUSE_INTERVAL, 1000). % -- Milliseconds to wait between making HTTP requests.
%%%
%%% ------------- Public -------------
%%%
-export([start/0, start/1, start/2]).
%%% debugging only: -compile(export_all).
%% start/0 - entry point used when no arguments are supplied.
%% Prints the welcome banner and then the usage text; the value returned
%% is whatever usage/0 returns.
start() ->
    welcome(),
    usage().
%% start/1 - entry point used when a single argument is supplied.
%%   sing           - prints the song text returned by itsPitchDark/0.
%%   anything else  - prints the welcome banner and then the usage text
%%                    (this is how `help' ends up showing the usage).
start(sing) ->
    io:fwrite(itsPitchDark(), []);
start(_Other) ->
    welcome(),
    usage().
%% start/2 - entry point that actually crawls.
%%   ArgStartUrl  - address to begin crawling at; passed unchanged to crawlInit/2.
%%   ArgStopAfter - number of documents after which the crawl stops.
%%
%% ArgStopAfter must be a number. crawl/3 decides when to stop with
%% `length(CrawlHistory) >= Limit', and in Erlang's term order every number
%% sorts below every atom, list, tuple, etc., so a non-numeric limit would
%% never be reached and the crawl would not stop on its own. Such input is
%% rejected here: an error line and the usage text are printed instead.
start(ArgStartUrl, ArgStopAfter) when is_number(ArgStopAfter) ->
    welcome(),
    crawlInit(ArgStartUrl, ArgStopAfter);
start(_ArgStartUrl, ArgStopAfter) ->
    welcome(),
    io:fwrite("Error: the document count must be a number, not ~p.~n", [ArgStopAfter]),
    usage().
%%%
%%% ------------- Private -------------
%%%
%% welcome/0 - prints the ASCII-art logo followed by a one-line greeting.
welcome() ->
    logo(),
    io:format("Welcome to ziggy, a still-experimental web crawler. Ziggy does not play guitar.~n").
%% usage/0 - prints the three supported ways of calling start/1 and start/2.
%%
%% The module name shown in the examples comes from ?MODULE so that the
%% printed calls can be typed as-is. (The text used to say `ziggy:start(...)',
%% which does not exist: this module is named ziggy3.)
usage() ->
    Module = ?MODULE,
    io:fwrite("Usage:\t~s:start(url,count) begins crawling at [url] and stops after [count] documents.~n", [Module]),
    io:fwrite("\t~s:start(sing) - Displays a song.~n", [Module]),
    io:fwrite("\t~s:start(help) - Displays this text.~n", [Module]).
%% logo/0 - prints the ASCII-art banner, one row per io:fwrite/2 call.
%% Each row is an io format string, so the trailing ~n is the newline and
%% "\\" is a single literal backslash in the output.
logo() ->
    Rows = [
        " _____~n",
        " / _ /( ) __ _ __ _ _ _ ~n",
        " \\// / | | / _` | / _` || | | |~n",
        " / //\\| || (_| || (_| || |_| |~n",
        " /____/|_| \\__, | \\__, | \\__, |~n",
        " |___/ |___/ |___/ ~n"
    ],
    lists:foreach(fun(Row) -> io:fwrite(Row, []) end, Rows).
%% itsPitchDark/0 - returns the song text as a single string.
%% The "~n" sequences are left unexpanded on purpose: the caller uses the
%% result as an io format string, which turns them into newlines.
itsPitchDark() ->
    % Adjacent string literals are joined into one string at compile time.
    "You are likely to be eaten by a grue. ~n"
    "If this predicament seems particularly cruel, ~n"
    "consider whose fault it could be: ~n"
    "not a torch or a match in your inventory. ~n"
    " - MC Frontalot~n".
%% crawlInit/2 - makes sure inets is running, crawls from Url, then reports.
%%   Url   - address to begin crawling at; passed unchanged to crawl/3.
%%   Limit - number of documents after which crawl/3 stops.
%%
%% Returns {Msg, CrawlHistory}, where CrawlHistory is whatever crawl/3
%% returned. When the limit is reached crawl/3 kills the calling process, so
%% control only comes back here if the crawl ends some other way.
crawlInit(Url, Limit) ->
    io:fwrite("Starting inets application.~n", []),
    % httpc (used by doCrawl/3) is part of inets. ensure_all_started/1 is
    % idempotent: it returns {ok, []} when inets is already running, whereas
    % application:start/1 would return {error, {already_started, inets}} on
    % every call after the first. Matching on {ok, _} makes a genuine startup
    % failure visible here instead of surfacing later as a failed request.
    {ok, _Started} = application:ensure_all_started(inets),
    io:fwrite("Beginning crawl at ~p.~n", [Url]),
    CrawlHistory = crawl(Url, [], Limit),
    % Write (and return, if anybody is listening) the exit message and history.
    % NOTE(review): "compelte" is a typo for "complete"; it is left untouched
    % because the string is part of this function's return value.
    Msg = "Crawling compelte.",
    io:fwrite("~p~n~n", [Msg]),
    {Msg, CrawlHistory}.
%% crawl/3 - decides what to do with one candidate URL.
%%   Url          - the address to consider.
%%   CrawlHistory - list of normalized URLs already visited (newest first).
%%   Limit        - crawl stops once the history holds this many entries.
%%
%% Limit reached: prints the history and a stop message, then sends this
%% process a `kill' exit signal. A kill signal is not an exception, so the
%% catch-all in doCrawl/3 cannot intercept it and the crawl really stops.
crawl(_Url, CrawlHistory, Limit) when length(CrawlHistory) >= Limit ->
    io:fwrite("Crawl History:~p~n", [CrawlHistory]),
    io:fwrite("We've reached our crawl limit of ~p. Stopping. Killing process. Exiting.~n~n", [Limit]),
    exit(self(), kill);
%% Limit not reached: normalize the URL. If it is already in the history it
%% is skipped and the history is returned unchanged; otherwise we pause
%% politely and hand over to doCrawl/3 with the URL added to the history.
crawl(Url, CrawlHistory, Limit) ->
    BaseUrl = normalizeUrl(Url),
    case lists:member(BaseUrl, CrawlHistory) of
        true ->
            io:fwrite("Skipping ~p; already crawled it.~n~n", [BaseUrl]),
            CrawlHistory;
        false ->
            io:fwrite("Crawl History:~p~n", [CrawlHistory]),
            io:fwrite("~n-- Pausing for reflection and polite bandwidth sharing.--~n~n", []),
            % Wait between HTTP requests so we do not hammer the remote host.
            timer:sleep(?POLITE_PAUSE_INTERVAL),
            % Cons BaseUrl onto the history so we remember having visited it.
            doCrawl(BaseUrl, [BaseUrl | CrawlHistory], Limit)
    end.
%% doCrawl/3 - fetches one document, stores it, and crawls the links in it.
%%   BaseUrl      - normalized address to fetch.
%%   CrawlHistory - URLs visited so far, already including BaseUrl.
%%   Limit        - passed through to crawlNewHrefs/3.
%%
%% Returns the crawl history: the one produced by crawlNewHrefs/3 on success,
%% or the CrawlHistory it was given if anything raised. The failure branch
%% must return a history too, because crawlNewHrefs/3 threads this result
%% into the next crawl/3 call; returning io:fwrite's `ok' there made that
%% next call fail as well, so one bad page abandoned all remaining links.
doCrawl(BaseUrl, CrawlHistory, Limit) ->
    io:fwrite("Crawling ~p.~n", [BaseUrl]),
    try
        % Ask for the content at BaseUrl; anything but {ok, ...} is a badmatch.
        {ok, {{_HttpVer, _Code, _Msg}, _Headers, Body}} =
            httpc:request(get, {BaseUrl, []}, [], [{sync,true}, {body_format,string}]),
        % Extract all of the hypertext references (hrefs) in the document.
        HrefsList = getHrefsFrom(Body, stemUrl(BaseUrl)),
        io:fwrite("I found the following links at ~p:~n~p~n~n", [BaseUrl, HrefsList]),
        % Store the body, numbered by how many documents have been visited.
        storeBody(Body, length(CrawlHistory)),
        % Lastly, crawl all of the hrefs found in this document.
        crawlNewHrefs(HrefsList, CrawlHistory, Limit)
    catch
        % Best effort: a lot can go wrong fetching and parsing a page, so
        % report it and carry on with the history we already have.
        Class:Reason ->
            io:fwrite("Warning. Error caught in doCrawl(): ~p~n", [ziggy_error_http_response]),
            io:fwrite("  while crawling ~p: ~p:~p~n", [BaseUrl, Class, Reason]),
            CrawlHistory
    end.
%% crawlNewHrefs/3 - crawls each URL in the list, in order.
%% Every URL goes through crawl/3 (not doCrawl/3) so the history check is
%% always applied; the history returned for one URL is the history passed
%% in for the next. Returns the history after the last URL, or CrawlHistory
%% unchanged when the list is empty.
crawlNewHrefs(Hrefs, CrawlHistory, Limit) ->
    CrawlOne = fun(Url, HistorySoFar) -> crawl(Url, HistorySoFar, Limit) end,
    lists:foldl(CrawlOne, CrawlHistory, Hrefs).
%% getHrefsFrom/2 - extracts the hrefs from a document body.
%%   Body    - the document text.
%%   StemUrl - passed through to processAnchorTagPositions/3
%%             (presumably used to resolve relative links - confirm there).
%%
%% Two steps: getAnchorTagPositions/1 locates the anchor tags in Body, and
%% processAnchorTagPositions/3 turns that result into the value returned here.
getHrefsFrom(Body, StemUrl) ->
    processAnchorTagPositions(getAnchorTagPositions(Body), Body, StemUrl).
getAnchorTagPositions(Body) ->
% Define a regular expression for the opening anchor tag.
% We don't (yet) care about the end unless we want to grab the link description, which would be cool eventually.
AnchorElementRegEx = "<[a|A] [^>]*>", % We need that space in there so that we don't match the
, , ,