% - -- - - -- - - -- - - -- - - -- - - -- - - -- - -
% ziggy3.erl - an experimental web crawler, version three
% - -- - - -- - - -- - - -- - - -- - - -- - - -- - -
-module(ziggy3).
-author('Alan G. Labouseur').

-define(else, true).                  % -- This is to make the if statements (somewhat more) readable.
-define(POLITE_PAUSE_INTERVAL, 1000). % -- Milliseconds to wait between making HTTP requests.

%%%
%%% ------------- Public -------------
%%%
-export([start/0, start/1, start/2]).
%%% debugging only: -compile(export_all).

%% start/0 - No command-line arguments: print the banner and the usage text.
start() ->
    welcome(),
    usage().

%% start/1 - One command-line argument.
%%   sing          -> prints the song returned by itsPitchDark/0.
%%   anything else -> (including help) prints the banner and the usage text.
start(sing) ->
    io:fwrite(itsPitchDark(), []);
start(_Arg) ->
    welcome(),
    usage().

%% start/2 - Print the banner, then crawl beginning at ArgStartUrl, stopping
%% after ArgStopAfter documents. Returns whatever crawlInit/2 returns.
%% Errors are not caught here; they propagate to the caller.
start(ArgStartUrl, ArgStopAfter) ->
    welcome(),
    crawlInit(ArgStartUrl, ArgStopAfter).

%%%
%%% ------------- Private -------------
%%%

%% Print the logo followed by the welcome line.
welcome() ->
    logo(),
    io:fwrite("Welcome to ziggy, a still-experimental web crawler. Ziggy does not play guitar.~n", []).

%% Print the usage text. The module name is taken from ?MODULE so that the
%% advertised calls always match the name this module is compiled under.
usage() ->
    io:fwrite("Usage:\t~p:start(url,count) begins crawling at [url] and stops after [count] documents.~n", [?MODULE]),
    io:fwrite("\t~p:start(sing) - Displays a song.~n", [?MODULE]),
    io:fwrite("\t~p:start(help) - Displays this text.~n", [?MODULE]).

%% Print the ASCII-art logo, one line per format string.
%% NOTE(review): the art looks as if runs of spaces were collapsed at some
%% point; the strings are kept exactly as found - confirm against the original.
logo() ->
    LogoLines = [" _____~n",
                 " / _ /( ) __ _ __ _ _ _ ~n",
                 " \\// / | | / _` | / _` || | | |~n",
                 " / //\\| || (_| || (_| || |_| |~n",
                 " /____/|_| \\__, | \\__, | \\__, |~n",
                 " |___/ |___/ |___/ ~n"],
    lists:foreach(fun(Line) -> io:fwrite(Line, []) end, LogoLines).

%% Return the song as a single io:fwrite/2 format string (it contains ~n directives).
itsPitchDark() ->
    "You are likely to be eaten by a grue. ~n" ++
    "If this predicament seems particularly cruel, ~n" ++
    "consider whose fault it could be: ~n" ++
    "not a torch or a match in your inventory. ~n" ++
    " - MC Frontalot~n".
%% Start the inets application and crawl from Url until Limit documents have
%% been visited. If the crawl runs out of links before the limit is reached,
%% prints the exit message and returns {Msg, CrawlHistory}. If the limit is
%% reached, crawl/3 kills the calling process and nothing is returned.
crawlInit(Url, Limit) ->
    io:fwrite("Starting inets application.~n", []),
    % TODO: Don't do this every time; just when we need to.
    application:start(inets),
    io:fwrite("Beginning crawl at ~p.~n", [Url]),
    CrawlHistory = crawl(Url, [], Limit),
    % Write (and possibly return if anybody is listening) our exit message and crawl history.
    Msg = "Crawling complete.",
    io:fwrite("~p~n~n", [Msg]),
    {Msg, CrawlHistory}.

%% Crawl Url unless the limit has been reached or the (normalized) URL is
%% already in CrawlHistory. Returns the updated crawl history.
crawl(Url, CrawlHistory, Limit) ->
    case length(CrawlHistory) >= Limit of
        true ->
            io:fwrite("Crawl History:~p~n", [CrawlHistory]),
            io:fwrite("We've reached our crawl limit of ~p. Stopping. Killing process. Exiting.~n~n", [Limit]),
            % Deliberately terminate the whole process: a kill signal cannot be
            % intercepted by the catch-all handler in doCrawl/3, so this unwinds
            % the entire recursion in one step.
            exit(self(), kill);
        false ->
            crawlIfNew(normalizeUrl(Url), CrawlHistory, Limit)
    end.

%% Crawl BaseUrl (already normalized) unless it is in CrawlHistory.
crawlIfNew(BaseUrl, CrawlHistory, Limit) ->
    case lists:member(BaseUrl, CrawlHistory) of
        true ->
            io:fwrite("Skipping ~p; already crawled it.~n~n", [BaseUrl]),
            CrawlHistory;
        false ->
            io:fwrite("Crawl History:~p~n", [CrawlHistory]),
            io:fwrite("~n-- Pausing for reflection and polite bandwidth sharing.--~n~n", []),
            timer:sleep(?POLITE_PAUSE_INTERVAL), % Polite use of bandwidth.
            % "Cons" the BaseUrl on to the CrawlHistory so we remember it.
            doCrawl(BaseUrl, [BaseUrl | CrawlHistory], Limit)
    end.

%% Fetch BaseUrl, store its body, and recursively crawl every link found in it.
%% Always returns a crawl history (a list): on any failure a warning is printed
%% and the history passed in is returned, so the callers can keep folding over it.
doCrawl(BaseUrl, CrawlHistory, Limit) ->
    io:fwrite("Crawling ~p.~n", [BaseUrl]),
    try
        % Send an HTTP request asking for the content at that address.
        {ok, {{_HttpVer, _Code, _Msg}, _Headers, Body}} =
            httpc:request(get, {BaseUrl, []}, [], [{sync, true}, {body_format, string}]),
        % Extract all of the hypertext references (hrefs) in the document.
        HrefsList = getHrefsFrom(Body, stemUrl(BaseUrl)),
        io:fwrite("I found the following links at ~p:~n~p~n~n", [BaseUrl, HrefsList]),
        % Store the body (in a ziggy-*.html file).
        storeBody(Body, length(CrawlHistory)),
        % Lastly, recursively crawl all of the hrefs in that list.
        crawlNewHrefs(HrefsList, CrawlHistory, Limit)
    catch
        % Best effort: anything that goes wrong while fetching or processing this
        % page is reported and the page is skipped.
        Class:Reason ->
            io:fwrite("Warning. Error caught in doCrawl(): ~p~n",
                      [{ziggy_error_http_response, Class, Reason}]),
            CrawlHistory
    end.

%% Crawl each URL in turn, threading the crawl history through. Every URL goes
%% back through crawl/3 so that the history and the limit are always checked.
crawlNewHrefs(Hrefs, CrawlHistory, Limit) ->
    lists:foldl(fun(Url, History) -> crawl(Url, History, Limit) end, CrawlHistory, Hrefs).

%% Return the list of absolute hrefs found in the anchor tags of Body.
getHrefsFrom(Body, StemUrl) ->
    processAnchorTagPositions(getAnchorTagPositions(Body), Body, StemUrl).

%% Run a regular expression for the opening anchor tag against Body.
%% Returns {match, [[{Start, Length}], ...]} or the atom nomatch.
getAnchorTagPositions(Body) ->
    % The space after the "a" is required so that we don't match other elements
    % whose names begin with "a" (e.g. <abbr>, <address>, <area>).
    % We don't (yet) care about the closing tag or the link description.
    AnchorElementRegEx = "<[aA] [^>]*>",
    re:run(Body, AnchorElementRegEx, [global]).

%% Turn the result of getAnchorTagPositions/1 into a list of hrefs.
processAnchorTagPositions(nomatch, _Body, _StemUrl) ->
    % The regex found no anchor tags.
    [];
processAnchorTagPositions({match, AnchorPositionsList}, Body, StemUrl) ->
    AnchorTagsList = extractAnchorTags(Body, AnchorPositionsList),
    extractHrefs(AnchorTagsList, StemUrl).

%% Cut each anchor tag out of Body. Positions are zero-based {Start, Length}
%% pairs, each wrapped in a one-element list, as returned by re:run/3.
extractAnchorTags(Body, AnchorPositionsList) ->
    [string:sub_string(Body, Start + 1, Start + Length)
     || [{Start, Length}] <- AnchorPositionsList].

%% Extract the href of each anchor tag and make it absolute with addFront/2.
%% Anchors without an href attribute are skipped; anchors whose quoted href is
%% never closed are skipped with a warning. The result is a flat list of strings.
%% TODO: Watch out for mailto:, javascript: and https: links.
extractHrefs([], _StemUrl) ->
    [];
extractHrefs([AnchorTag | Rest], StemUrl) ->
    case rawHref(AnchorTag) of
        {ok, RawHref} ->
            [addFront(StemUrl, RawHref) | extractHrefs(Rest, StemUrl)];
        {error, no_href} ->
            extractHrefs(Rest, StemUrl);
        {error, unterminated_quote} ->
            io:fwrite("Warning. Error caught in extractHrefs(): ~p~n", [ziggy_error_href]),
            extractHrefs(Rest, StemUrl)
    end.

%% Return {ok, Value} for the href attribute value of AnchorTag, or {error, Why}.
rawHref(AnchorTag) ->
    % Search a lower-cased copy so that HREF= is found too; the old-style
    % string:to_lower/1 maps character by character, so positions still line up.
    case string:str(string:to_lower(AnchorTag), "href=") of
        0 ->
            {error, no_href};
        HrefPosition ->
            hrefValue(lists:nthtail(HrefPosition + 4, AnchorTag))
    end.

%% Parse the text that follows "href=".
hrefValue([Quote | Quoted]) when Quote =:= $\"; Quote =:= $\' ->
    % href="blah..." or href='blah...': take everything up to the matching quote.
    case lists:splitwith(fun(C) -> C =/= Quote end, Quoted) of
        {_Unterminated, []} -> {error, unterminated_quote};
        {Value, [Quote | _]} -> {ok, Value}
    end;
hrefValue(Unquoted) ->
    % No quote: href=blah... The HTML is mal-formed, so "just punt" and take
    % everything up to the first whitespace or the end of the tag.
    IsValueChar = fun(C) -> not lists:member(C, " \t\r\n>") end,
    {ok, lists:takewhile(IsValueChar, Unquoted)}.

%% Store the content in a ziggy-*.html file in the current directory.
storeBody(Body, FileNum) ->
    % Strip all carriage returns and line feeds; the body is written with ~p
    % (as a single Erlang term), so the file ends up holding one long line.
    FilteredBody = [C || C <- Body, C =/= $\n, C =/= $\r],
    % Current date (for human readability) and timestamp (to tell runs apart).
    % FileNum distinguishes the files written within one crawl.
    {Year, Month, Day} = date(),
    {MegaSecs, Secs, MicroSecs} = erlang:timestamp(),
    % File name: ziggy-YYYY-MM-DD--MegaSecs-Secs-MicroSecs-FileNum.html
    % The .html extension is for easy viewing and previewing on most OSes and to
    % make importing to Lucene/Solr a little easier.
    FileName = lists:concat(["ziggy", "-", Year, "-", Month, "-", Day, "--",
                             MegaSecs, "-", Secs, "-", MicroSecs, "-", FileNum, ".html"]),
    io:fwrite("Storing content in ~p.~n", [FileName]),
    % Open (or overwrite) our file, write the content, and always close it.
    {ok, Handle} = file:open(FileName, [write, {encoding, utf8}]),
    try
        io:fwrite(Handle, "~p~n", [FilteredBody])
    after
        file:close(Handle)
    end.

%% Extract the beginning stem of the URL (e.g. www.example.com) for use in addFront/2.
stemUrl(Url0) ->
    % 1. Remove "http://".
    Url1 = replaceInString(Url0, "http://", ""),
    % 2. Remove everything from the first "/" on. The stem must not have a
    %    trailing slash because it gets prepended to other URLs later.
    case string:str(Url1, "/") of
        0 -> Url1;
        FirstSlashPos -> string:left(Url1, FirstSlashPos - 1)
    end.

%% Add a trailing slash to a string (hopefully a URL in this context) if it
%% doesn't already have one. Scheme-Based Normalization - RFC 3986 section 6.2.3.
trailingSlash(Url) ->
    case lists:suffix("/", Url) of
        true -> Url;
        false -> Url ++ "/"
    end.

%% Return "" if UrlPart1 ends with a slash or UrlPart2 begins with one;
%% otherwise return "/" so the two parts can be concatenated.
getSlashIfNeeded(UrlPart1, UrlPart2) ->
    case lists:suffix("/", UrlPart1) orelse lists:prefix("/", UrlPart2) of
        true -> "";
        false -> "/"
    end.

%% Make NewUrl absolute: make sure it starts with http:// and the stem of the
%% site so that links like /about/ can be followed.
addFront(StemUrl, NewUrl) ->
    case {lists:prefix("http://", NewUrl), lists:prefix(StemUrl, NewUrl)} of
        {true, _} ->
            % It begins with http:// so don't mess with it.
            NewUrl;
        {false, true} ->
            % It already begins with the stem: only the scheme is missing.
            % No separator goes between "http://" and the host.
            "http://" ++ NewUrl;
        {false, false} ->
            "http://" ++ StemUrl ++ getSlashIfNeeded(StemUrl, NewUrl) ++ NewUrl
    end.

%% Hypothesis: it's a filename if the last dot comes after the last slash.
%% When the URL has a scheme, only the part after the host is examined, so the
%% dots in http://www.example.com do not make the site root look like a file.
isFile(Url) ->
    Path = case splitScheme(Url) of
               {"", _} -> Url;
               {_Scheme, AfterScheme} -> lists:dropwhile(fun(C) -> C =/= $/ end, AfterScheme)
           end,
    string:rstr(Path, ".") > string:rstr(Path, "/").

%% Split Url into {Scheme, Rest}, where Scheme is everything up to and including
%% a leading "scheme://". Scheme is "" if the URL does not start with one.
splitScheme(Url) ->
    SchemePos = string:str(Url, "://"),
    % A real scheme separator contains the first slash of the URL.
    case SchemePos > 0 andalso string:str(Url, "/") =:= SchemePos + 1 of
        true -> lists:split(SchemePos + 2, Url);
        false -> {"", Url}
    end.

%% Replace every occurrence of This in Str with That. The string is re-scanned
%% from the start after each replacement, so matches created by a replacement
%% are replaced too ("/././" -> "/" for "/./" -> "/").
%% NOTE: That must not contain This, or this function will not terminate.
replaceInString(Str, This, That) ->
    case string:str(Str, This) of
        0 ->
            Str;
        Pos ->
            Before = lists:sublist(Str, Pos - 1),
            After = lists:nthtail(Pos - 1 + length(This), Str),
            replaceInString(Before ++ That ++ After, This, That)
    end.

%% Normalize the passed-in URL according to RFC 3986 (section 6.2) and some
%% additional criteria.
normalizeUrl(Url0) ->
    % 1. + 2. Lower-case the scheme and host and drop an explicit port 80.
    Url2 = normalizeAuthority(Url0),
    % 3. Replace /./ with / - Path Segment Normalization. TODO: Handle ../ as well.
    Url3 = replaceInString(Url2, "/./", "/"),
    % 4. Remove a page-jump fragment (the "#" in ...example.com/help#email)
    %    and everything that follows it.
    FinalUrl = lists:takewhile(fun(C) -> C =/= $# end, Url3),
    % A filename is returned as is; anything else is (hopefully) a directory and
    % gets a trailing slash if it doesn't already have one - RFC 3986 section 6.2.3.
    case isFile(FinalUrl) of
        true -> FinalUrl;
        false -> trailingSlash(FinalUrl)
    end.

%% Case normalization (RFC 3986 section 6.2.2.1) applies to the scheme and host
%% only: paths may be case-sensitive, so the rest of the URL is left untouched.
%% An explicit ":80" is removed only when it is the port of an http (or
%% scheme-less) host, so ports such as :8080 survive.
normalizeAuthority(Url) ->
    {Scheme0, AfterScheme} = splitScheme(Url),
    {Host0, PathEtc} =
        lists:splitwith(fun(C) -> not lists:member(C, "/?#") end, AfterScheme),
    Scheme = string:to_lower(Scheme0),
    Host1 = string:to_lower(Host0),
    HasDefaultPort = lists:member(Scheme, ["", "http://"]) andalso lists:suffix(":80", Host1),
    Host = case HasDefaultPort of
               true -> lists:sublist(Host1, length(Host1) - 3);
               false -> Host1
           end,
    Scheme ++ Host ++ PathEtc.