(*
 * LANGUAGE    : ANS Forth
 * PROJECT     : Forth Environments
 * DESCRIPTION : check list of URLs for new data
 * CATEGORY    : Tool
 * AUTHOR      : Marcel Hendrix 
 * LAST CHANGE : Saturday, August 31, 2002 2:00 PM, Marcel Hendrix, cosmetics
 * LAST CHANGE : August 18, 2002, Marcel Hendrix 
 *)



	NEEDS -miscutil
	NEEDS -sockets
	NEEDS -assemble

	REVISION -scoot " Scoot -- URL fetch  Version 1.03 "

	PRIVATES

DOC
(*
  WHAT
  ----
  The idea is to check if any of the URLs in the "idata/saved-urls.dat" file have
  changed since last time we checked. This is done by computing the 32-bit 
  CRC of these pages' HEAD output.
  You can check URLs in parallel with browsing the 'net in the "normal" way.

  BUGS
  ----
  There is a large timeout when a page is off-line or not accessible.
  I don't see an easy fix for that. (It may be possible to change
  ACCUMULATE to have a time-out.)
  A page's server and path are concatenated by the "^" character which is 
  a bit of a hack.

  EXAMPLE OUTPUT
  --------------
	FORTH> update-crcs
	  0 stable :: www.tim-mann.org^chess.html
	  1 stable :: home.iae.nl^users/mhx/index.html

  TYPICAL GET Request
  -------------------
  See RFC2068.
	S" www.quartus.net^/discus/messages/board-topics.html" GET-HEAD CR TYPE
	HTTP/1.1 200 OK
	Date: Sat, 14 Sep 2002 20:13:59 GMT
	Server: Apache/1.3.26
	Last-Modified: Sat, 14 Sep 2002 12:14:36 GMT
	ETag: "12c530-161f-3d83282c"
	Accept-Ranges: bytes
	Content-Length: 5663
	Connection: close
	Content-Type: text/html
*)
ENDDOC

-- Tools -----------------------------------------------------------------

-- I forgot how to do this in high-level
CODE crc32  ( n1 char -- n2 )
    rpush,
    ebx pop, 
    edx -> [esp] xchg,		\ pop crc to edx
    8 b# -> ecx mov,		\ loop count
@@1:
    edx shr,			\ shift crc
    bh rcr,
    bl ror,			\ shift character
    ebx -> eax mov,		\ save character
    bh -> bl xor,
    @@2 offset SHORT jns,	\ skip if equal
    $EDB88320 d# -> edx xor, 	\ crc-32 polymial 1 04C1 1DB7
@@2:
    eax -> ebx mov,		\ restore character
    @@1 loop,			\ next bit
    edx -> [esp] xchg,		\ crc to tos
    rpop, ebx jmp,
END-CODE PRIVATE

-- calculate crc-32 of string
: crc-32  ( c-addr u -- n2 )
	-1 -ROT BOUNDS ?DO  I C@ crc32  LOOP INVERT ;P

-- http access -----------------------------------------------------------

: (GET?) ( size c-addr u c-addr1 u1 -- c-addr2 u2 )
	'^' (lex) 0= ABORT" (GET) :: invalid URL" DROP 2SWAP
	2OVER #80 OPEN-SERVICE LOCAL WWW
	2ROT 2SWAP $+ S"  HTTP/1.1" $+ +CR 
	S" Host: " $+ 2SWAP $+ +CR
	S" If-Modified-Since: Sat, 14 Sep 2002 10:43:31 GMT" $+
	S" Connection: close" $+ +CR
	+CR WWW WRITE-SOCKET
	WWW PAD ROT READ-SOCKET ( addr size ) 
	WWW CLOSE-SOCKET ;P

: (GET) ( size c-addr u c-addr1 u1 -- c-addr2 u2 )
	'^' (lex) 0= ABORT" (GET) :: invalid URL" DROP 2SWAP
	2OVER #80 OPEN-SERVICE LOCAL WWW
	2ROT 2SWAP $+ S"  HTTP/1.1" $+ +CR 
	S" Host: " $+ 2SWAP $+ +CR
	S" Connection: close" $+ +CR
	+CR WWW WRITE-SOCKET
	WWW PAD ROT READ-SOCKET ( addr size ) 
	WWW CLOSE-SOCKET ;P

\ Example:  S" www.tim-mann.org^/chess.html/index.html" GET-URL
: GET-URL ( c-addr1 u1 -- c-addr2 u2 )
	GET-SOCKET-TIMEOUT >S #10000 SET-SOCKET-TIMEOUT
	2>R UNUSED S" GET " 2R> (GET) 
	S> SET-SOCKET-TIMEOUT ;

: GET-URL? ( c-addr1 u1 -- c-addr2 u2 )
	GET-SOCKET-TIMEOUT >S #10000 SET-SOCKET-TIMEOUT
	2>R UNUSED S" GET " 2R> (GET?) 
	S> SET-SOCKET-TIMEOUT ;

: .URL ( c-addr1 u1 -- )
	GET-URL CR TYPE CR ;

: ./URL ( size c-addr1 u1 -- )
	GET-SOCKET-TIMEOUT >S #10000 SET-SOCKET-TIMEOUT
	  S" GET " 2SWAP (GET) CR TYPE CR 
	S> SET-SOCKET-TIMEOUT ;

\ Example:  S" www.tim-mann.org^/chess.html/index.html" GET-HEAD
: GET-HEAD ( c-addr1 u1 -- c-addr2 u2 )
	GET-SOCKET-TIMEOUT >S #5000 SET-SOCKET-TIMEOUT
	2>R #1024 S" HEAD " 2R> (GET) 
	S> SET-SOCKET-TIMEOUT ;

: $->DEC# ( c-addr1 u1 -- u2 ) 
	BASE @ >S DECIMAL 
	  0. 2SWAP >NUMBER 3DROP
	S> BASE ! ;

\ Example:  S" www.tim-mann.org^/chess.html/index.html" URL->CRC
: URL->CRC ( c-addr1 u1 -- size u3 )
	GET-HEAD 
	2DUP S" Content-Length: " DUP >S 
	SEARCH 0= IF  -S  2DROP  UNUSED ( unknown content length)
		ELSE  S> /STRING $->DEC#
	       ENDIF
	-ROT S" Last-Modified:" SEARCH 0= IF  3DROP 0 0 EXIT  ENDIF
	crc-32 ;


-- A small URL database --------------------------------------------------

0 VALUE rec#  	  PRIVATE
CREATE URL-DBASE  PRIVATE  HERE #100 CELLS DUP ALLOT ERASE

: cleanup ( u -- )
	DROP 
	URL-DBASE rec# 0 ?DO  @+ FREE ?ALLOCATE  LOOP DROP
	CLEAR rec# 
	URL-DBASE #100 CELLS ERASE ;P 

: ADD-URL ( c-addr1 u1 -- )
	rec# 100 U>= ABORT" add-url :: URL-DBASE full"
	DUP 2 CELLS + ALLOCATE ?ALLOCATE ( -- c-addr u addr )
	CELL+ CELLPACK CELL- URL-DBASE rec# CELL[] ! 
	1 +TO rec# ;P

	' cleanup IS-FORGET ADD-URL

: ADD-CRC	URL-DBASE []CELL @ ! ;P  	( u rec# -- )
: ADD-DATA 	rec# >S ADD-URL	S> ADD-CRC ; 	( crc c-addr u -- )
: URL@ 		URL-DBASE []CELL @ CELL+ @+ ;P	( ix -- c-addr1 u1 )
: CRC@ 		URL-DBASE []CELL @ @ ;P		( ix -- u )
: job 		URL@ ./URL ;P			( size ix -- )

: RECORD->FILE ( crc c-addr u1 handle -- )
	>S ROT U>D #12 (UD.R) S~      S" ~ $+ 
	   2SWAP $+ 
	   S~ " ADD-DATA~ $+ 
	S> WRITE-LINE ?FILE ;P

: SAVE-URLS ( -- ) 
	S" ./idata/saved-urls.dat" W/O BIN CREATE-FILE ?FILE LOCAL handle
	URL-DBASE rec# 0 ?DO  @+ @+ SWAP @+ handle RECORD->FILE 
			LOOP  DROP
	handle CLOSE-FILE ?FILE ;P

: RESTORE-URLS 	S" ./idata/saved-urls.dat" INCLUDED ;P	( -- )

	RESTORE-URLS

: .URLS ( -- )
	URL-DBASE
	rec# 0 ?DO  @+ @+ SWAP @+ CR I 3 .R ."  :: " TYPE 
		    C/L #20 - HTAB ." -- CRC = " H. 
	      LOOP 
	DROP ; 

-- Vector this to what you want done when a page changes -----------------

DEFER DO-SOMETHING ( size index -- ) ' job IS DO-SOMETHING

: UPDATE-CRCs ( -- )
	0 0 0 LOCALS| =size =ncrc =ocrc |
	URL-DBASE
	rec# 0 ?DO  
	      	    CR I 3 .R SPACE I URL@ TYPE
		    @+  @+ TO =ocrc @+ URL->CRC TO =ncrc TO =size
		    =ncrc I ADD-CRC  
		    =ocrc =ncrc <> IF  C/L #27 - HTAB ."  :: contents have changed" 
		    		       CR =size I DO-SOMETHING 
			         ELSE  C/L #27 - HTAB ."  :: stable" 
		    	        ENDIF
	      LOOP 
	DROP 
	SAVE-URLS ; 

: GET-HEADS   rec# 0 ?DO  CR I 3 .R SPACE I URL@ 2DUP TYPE CR GET-HEAD TYPE  LOOP ;

:ABOUT	CR .~ Try: S" <server>^<path>" GET-URL  ( -- c-addr u ) -- get URL content in a string~
	CR .~ Try: S" <server>^<path>" GET-HEAD ( -- c-addr u ) -- get URL HEAD content in a string~
	CR .~      S" <server>^<path>" .URL                     -- print page content~ 
	CR .~      S" <server>^<path>" URL->CRC ( -- u )        -- compute 32-bit CRC of page~ 
	CR .~ crc  S" <server>^<path>" ADD-DATA                 -- store crc entry in database file~
	CR ."      .URLS                                        -- print all items in database"
        CR ."      UPDATE-CRCs                                  -- update CRCs and perform actions" ;

		.ABOUT -scoot CR
		DEPRIVE

				(* End of Source *)