/*
	Introduction.

	We are to remember ALL visited URLs in one big list.
	It is intended to exclude repetitions,
	since hypertext links can contain multiple links to
	the same location and to the same document,
	and even loops.

	We will add all URLs to the linked list and mark
	visited ones in it.
	However we will NEVER delete URLs from this list!
*/
#include "defs.h"
#include <string.h>

/* Scheme prefix "http:" (without slashes). It is compared against the start
 * of hrefs and used when a full name has to be assembled from its parts. */
char HTTP_PREFIX      [] = "http:"  ;

/* Create a new empty record about a hypertext reference */

/*
 * Allocate a new URL record, append it to the tail of "list" and fill in
 * its default field values.
 *
 *   list     - list the record is appended to
 *   parent   - record of the referring document, or NULL for a root entry;
 *              the new record's level is parent->level + 1 (0 without parent)
 *   href     - reference text as found; copied into shortName only when
 *              the file is compiled with DEBUG
 *   fullName - full URL name; the pointer itself is stored (not copied), so
 *              the caller must pass an allocated string it no longer owns
 *
 * Returns the new record. Each record gets the next value of a serial
 * number that starts at 1 and is never reused.
 *
 * If memory cannot be allocated the failure is reported and die() is called;
 * NULL is returned only if die() comes back (presumably it does not --
 * NOTE(review): confirm die() semantics).
 */
URL *newURL(List *list, URL *parent, char *href, char *fullName){
	static unsigned long serial_number = 0;

	URL *ptr = (URL *) calloc(1, sizeof(URL));

	/* Never link or touch a record that could not be allocated */
	if(ptr == NULL){
		fprintf(stderr, "!!! newURL: out of memory for %s\n", fullName);
		die(1);
		return NULL;
	}

	/* Join to the list */
	if(list->head == NULL){
		list->head = list->tail = ptr;
	} else {
		list->tail->next = ptr;
		list->tail       = ptr;
	}
	ptr->next = NULL;

	ptr->parentURL = parent;        /* list->currentURL        */
	ptr->counter   = 1;             /* reference counter       */
	ptr->serial    = ++serial_number;
	ptr->flags     = UNTOUCHED;     /* has not been copied yet */
	ptr->port      = HTTPPORT;
	ptr->retcode   = 000;
	ptr->level     = parent ? parent->level + 1 : 0;
	ptr->size      = 0;
	ptr->declared_size = 0;
	ptr->trys      = 0;
	ptr->hops      = 0;	/* not defined until name known */
	ptr->hrefs     = 0;

#ifdef DEBUG
	ptr->shortName = strdup(href);
#endif
	ptr->fullName  = fullName;      /* already strdup()ped */

	/* The parsed parts are filled in later (see parseName()) */
	ptr->hostName  = NULL;
	ptr->urlName   = NULL;
	ptr->ctype     = NULL;
	ptr->location  = NULL;

	ptr->address_list = NULL;

	if(verbose)
		fprintf(fplog, "\t+++ newURL [%06ld]: %s\n", ptr->serial, fullName);

	return ptr;
}

/* ____________________________________________________________________________ */
/*
	http://host/docpath
	ftp://host/docpath
	file://docpath
	gopher://host/docpath
	wais://host/group
	mailto:user@host
	news:newsgroup
	local://url
	search://...

*/

/* NULL-terminated table of the scheme names that isFullURL() and
 * isFullURL3() recognise (compared case-insensitively, without the ':'). */
char *URLtypes[] = {
	"http",

	"file",
	"ftp",
	"wais",
	"gopher",
	"mailto",
	"news",
	"telnet",
	"archie",
	"doc",          /* Hot Java's */
	"search",
	"future",
	NULL
};
/* NULL-terminated table of the name prefixes that isNotHTTP() treats
 * as HTTP. */
char *HTTPtypes[] = {
	"http",
	NULL
};

/* "Full" here means "has the XXX:// or XXX:/ or XXX: prefix" */
/*
 * Check whether href begins with one of the scheme names from URLtypes
 * (any letter case) immediately followed by ':'.
 *
 * On a match the scheme name inside href is converted to lower case in
 * place, *rest is set to the first character after "scheme:" -- or after
 * "scheme://" when both slashes are present -- and TRUE is returned:
 *
 *	http://hostname/pathname        http:/pathname
 *	       ^rest                         ^rest
 *
 * Without a match *rest is NULL and FALSE is returned.
 * *rest is used later by computeFullName().
 */
Bool isFullURL(char *href, char **rest){
	char **type;

	*rest = NULL;

	for(type = URLtypes; *type != NULL; type++){
		int   n = strlen(*type);
		char *after;

		/* The prefix test must come first: it guarantees href[n] exists */
		if(strncasecmp(href, *type, n) != 0 || href[n] != ':')
			continue;

		/* Cut href at the colon just long enough to lowercase the scheme */
		href[n] = '\0';
		lowercase((unsigned char *)href);
		href[n] = ':';

		after = href + n + 1;
		if(after[0] == '/' && after[1] == '/')
			after += 2;     /* now at the beginning of a hostname */

		*rest = after;
		return TRUE;
	}
	return FALSE;
}

/*
 * Stricter variant of the scheme test: returns TRUE only when href begins
 * with a scheme name from URLtypes (any letter case) followed by the three
 * characters "://". On a match the scheme name is lowercased in place.
 */
Bool isFullURL3(char *href){
	char **type = URLtypes;

	while(*type != NULL){
		int n = strlen(*type);

		if(strncasecmp(href, *type, n) == 0 && strncmp(href + n, "://", 3) == 0){
			/* Temporarily terminate at the colon to lowercase the scheme only */
			href[n] = '\0';
			lowercase((unsigned char *)href);
			href[n] = ':';

			return TRUE;
		}
		type++;
	}
	return FALSE;
}

/*
 * Returns FALSE when href starts (case-insensitively) with any entry of
 * HTTPtypes, TRUE otherwise. Only the prefix is compared; whatever follows
 * the prefix is not examined.
 */
Bool isNotHTTP(char *href){
	char **type = HTTPtypes;

	while(*type != NULL){
		int n = strlen(*type);

		if(strncasecmp(href, *type, n) == 0)
			return FALSE;	/* HTTP */
		type++;
	}
	return TRUE;
}

/* Count hops distance */
/*
 * Compute the hop distance of ptr from its parent record: inherit the
 * parent's value, plus one when both host names are known and differ.
 * A record without a parent is left untouched.
 */
void setHops(URL *ptr){
	URL *up = ptr->parentURL;

	if(up == NULL)
		return;

	ptr->hops = up->hops;

	/* A different host name costs one extra hop */
	if(ptr->hostName != NULL && up->hostName != NULL &&
	   strcmp(ptr->hostName, up->hostName) != 0)
		ptr->hops = up->hops + 1;
}

/*
 * Decide whether the host of "ptr" may be contacted.
 *
 * The tests are applied in this order:
 *   1. hop limit   - when maxhops > 0 and ptr->hops exceeds it   => FALSE
 *   2. deny list   - "host:port" is matched by hostDenied()      => FALSE
 *   3. allow list  - if allowed_host is set: an allowed host     => TRUE,
 *                    any other host while recflag is TRUE        => FALSE
 *   4. recflag TRUE                                              => TRUE
 *   5. otherwise the host must be that of the very first URL in the
 *      list, either by name (case-insensitive) or because
 *      oneOfAddresses() matches it against the first URL's address
 *      list; in the latter case ptr->hostName is replaced by a copy of
 *      the first URL's host name.
 *   An empty list admits any host.
 *
 * Side effects: may log to fplog/stderr and may replace ptr->hostName.
 * A record without a host name is denied.
 */
Bool isLegalHost(List *list, URL *ptr){
	char *hostname = ptr->hostName;
	char biff[1024];
	extern Item *allowed_host;

	if(hostname == NULL){
		fprintf(fplog, "\t@@@ Host name is undefined, deny\n");
		return FALSE;
	}

	/* Bounded formatting: host names come from untrusted documents */
	snprintf(biff, sizeof biff, "%s:%u", hostname, (unsigned) ntohs(ptr->port));

	/* Recursion and too many hops ==> deny */
	if(maxhops > 0){
		if(ptr->hops > maxhops){
			char *parentHost = "(none)";

			if(ptr->parentURL && ptr->parentURL->hostName)
				parentHost = ptr->parentURL->hostName;

			fprintf(fplog, "\t@@@ TOO MANY HOPS: %d > %d PARENT: %s\n",
				ptr->hops, maxhops, parentHost);
			return FALSE;
		}
		/* else go to other tests */
	}
	/* Host is in denied list ==> deny */
	if(hostDenied(biff))
		return FALSE;

	/* If there is a list of the allowed hosts...
	 * If no recursion ==> just admit allowed hosts.
	 * If there is a recursion => admit ONLY allowed hosts and deny all others.
	 */
	if(allowed_host){
		if(hostAllowed(hostname))
			return TRUE;
		else if(recflag == TRUE)
			return FALSE;
	}

	if(recflag == TRUE)
		return TRUE;
	/* All filtered hosts are allowed */

	/* Check if this is the NAME of our very first host
	 *               OR
	 * addresses of both hosts are the same.
	 */
	if(list->head){
		char *headHost = list->head->hostName;

		/* The first record may have no parsed host name; nothing can match it */
		if(headHost == NULL)
			return FALSE;

		/* check the name */
		if(strcasecmp(headHost, hostname) == 0)
			return TRUE;

		/* check aliases */
		if(list->head->address_list && oneOfAddresses(hostname, list->head->address_list)){
			/* change hostname for the ptr to the canonical one */
			char *canonical;

			fprintf(fplog,  "\t@@@ NAME changed: %s --> %s\n",
					       hostname, headHost);
			fprintf(stderr, "@@@ NAME changed: %s --> %s\n",
					       hostname, headHost);

			/* Keep the old name if the copy cannot be made */
			canonical = strdup(headHost);
			if(canonical != NULL){
				free(ptr->hostName);
				ptr->hostName = canonical;
			}

			return TRUE;
		}
		return FALSE;
	}
	/* list->head can have ANY name since it is the very first name ! */
	return TRUE;
}

/* ____________________________________________________________________________ */
/* Special path components recognised by canonize(). The forms ending in '/'
 * are matched as prefixes of the remaining href, the bare forms (names
 * ending in "__") are matched only against the whole remaining href. */
char ThisDir    [] = "./";
char ThisDir__  [] = ".";
char ParentDir  [] = "../";
char ParentDir__[] = "..";

/* ...../xxxxx/yyyyy  --> ...../xxxxx   */
/* ...../xxxxx/yyyyy/ --> ...../xxxxx   */
/* cut off the last component of the u  */

/* chdir .. */

/*
 * Remove the last path component of u in place (the "chdir .." step):
 *
 *	...../xxxxx/yyyyy   -->  ...../xxxxx
 *	...../xxxxx/yyyyy/  -->  ...../xxxxx
 *
 * A single trailing '/' is dropped first. When the only remaining '/' is
 * the leading one (or there is none), u is truncated after its first
 * character, which leaves "/" for a rooted path.
 */
void CUTOFF(char *u){
	char *tail = strlast(u);
	char *slash;

	/* Drop one trailing '/', but never the leading one */
	if(tail != u && *tail == '/')
		*tail = '\0';

	slash = strrchr(u, '/');

	if(slash == NULL || slash == u){
		/* leave alone "/" */
		u[1] = '\0';
		return;
	}

	/* Cut at the separator in front of the last component */
	*slash = '\0';
}

/* returns malloc()ed string -- to be free()d in the future. */
/* currentDir here already has a canonical form.             */

/*
 * Apply the path in href, component by component, to a private copy of
 * currentdir and return the resulting path.
 *
 *   currentdir - starting directory; it is copied, never modified
 *   href       - path to apply; leading and repeated '/' are skipped.
 *                One '/' inside href is overwritten with '\0' and restored
 *                while a component is copied, so href must be writable.
 *   addflags   - IGNORED is OR-ed into *addflags for every ".." component
 *                when skipparentdir is set
 *
 * "." components are dropped, each ".." component removes the last
 * component of the copy with CUTOFF(), any other component is appended
 * with exactly one '/' between it and the copy.
 *
 * The result comes from strdup()/strspl() and is to be free()d by the
 * caller (strspl() is presumably an allocating concatenation -- its
 * results are free()d here like malloc()ed memory).
 */
char *canonize(char *currentdir, char *href, int *addflags){
	char *workplace;

	workplace = strdup(currentdir);
	/* a copy for the destructive manipulations */

	while(*href == '/') href++;

	while(*href != '\0'){

		/* Look at the components of the path step by step */

		/* If "." or ".." */
		if(strncmp(href, ParentDir,   STRLEN(ParentDir  )) == 0 ||
		   strcmp (href, ParentDir__                     ) == 0 ||
		   strncmp(href, ThisDir,     STRLEN(ThisDir    )) == 0 ||
		   strcmp (href, ThisDir__                       ) == 0
		){
			/* ------------------------------------------------------- */
			/*
			 * href="./something.html"
			 *      is equal to
			 * href="something.html"
			 *
			 */
			/* ------------------------------------------------------- */
			while(strncmp(href, ThisDir, STRLEN(ThisDir)) == 0 ||
			      strcmp (href, ThisDir__)                == 0
			){
				/* Step over "." (end of href) or "./" */
				if(strcmp(href, ThisDir__) == 0)
					href += STRLEN(ThisDir__);
				else    href += STRLEN(ThisDir);

				while(*href == '/') href++;

				/* Just ignore "." */
			}

			/* ------------------------------------------------------- */
			/*
			 * href="../something.html"
			 *    where
			 * workplace="http://host/.../parent/thisdir/"
			 *
			 * Then cut the workplace to
			 *
			 * workplace="http://host/.../parent"
			 *
			 */
			/* ------------------------------------------------------- */
			while(strncmp(href, ParentDir, STRLEN(ParentDir)) == 0 ||
			      strcmp (href, ParentDir__) ==0
			){

				/* Step over ".." (end of href) or "../" */
				if(strcmp(href, ParentDir__) == 0)
					href += STRLEN(ParentDir__);
				else    href += STRLEN(ParentDir);

				while(*href == '/') href++;

				/* Stage B: Extract the name of parent directory:
				 * workplace="http://host/.../parent/thisdir"
				 *      to
				 * workplace="http://host/.../parent"
				 */
				CUTOFF(workplace);

				/* Optionally mark references that climb to a parent directory */
				if(skipparentdir){
					fprintf(fplog, "\t@@@ Will be ignored because of href=\"..\"\n");
					*addflags |= IGNORED;
				}
			}

		} else {
			/* Extract the next component from the href and append
			 * it to the end of "workplace"
			 */

			char *s, *last, *newwork;

			while(*href == '/') href++;

			if(*href == '\0')
				break;

			s    = strchr(href, '/');       /* separator between components */
			last = strlast(workplace);

			if(s && s[1] != '\0'){          /* append href up to s-pointer */
				/* Cut href at the separator only while the component is copied */
				*s = '\0';
				newwork = (*last == '/') ?     /* APPEND TO THE END of workplace */
					strspl(workplace,      href, NULL) :
					strspl(workplace, "/", href, NULL) ;
				*s = '/';
				free(workplace);
				workplace = newwork;

				href = s+1;     /* after the separating '/' */

				while(*href == '/') href++;     /* skip separator(s) */

			} else {                        /* append all the tail - href */
				/* Last component; a trailing '/' in href, if any, is kept */
				newwork = (*last == '/') ?     /* APPEND TO THE END of workplace */
					strspl(workplace,      href, NULL) :
					strspl(workplace, "/", href, NULL) ;

				free(workplace);
				workplace = newwork;

				href = strend(href);

				break;
			}
		}
	} /* end while */

	if(verbose)
		fprintf(fplog, "\t;;; %s\n", workplace);

	return workplace;
}
/* ____________________________________________________________________________ */
/* Detect: 	index.php?blah=blah/blah/blah
 * note slashes after the ? sign.
 */
/*
 * Returns TRUE when ref has a query part (text after the first '?') that
 * contains a '/' or a '\', e.g. "index.php?article=books/kkm/index.htm".
 * Returns FALSE when there is no '?', when the query has no slashes, or
 * when ref itself starts with '?' (that form is handled by the caller).
 */
int isBadPHP(char *ref){
	char *p;

	if(ref[0] == '?')
		return FALSE;	/* handled separately */

	p = strchr(ref, '?');
	if(p == NULL)
		return FALSE;	/* no query part at all */

	/* Scan everything after the '?' for a path separator */
	while(*++p != '\0'){
		switch(*p){
		case '/':
		case '\\':
			return TRUE;
		}
	}
	return FALSE;
}
/* ____________________________________________________________________________ */
/*
	href="..."              OR      href=...
	src="..."                       src=...
	background="..."                background=...
	url="..."                       url=...

					But this must cause a warning message!

	Fully qualified names
	---------------------
	http://hostname/pathname
	ftp://hostname/pathname
	wais://hostname/pathname
	gopher://hostname/pathname
	mailto:emailaddress
	news:newsgroup

	Relative names (use current host)
	---------------------------------
	http:/pathname  (using the current host)
	file://pathname
	/pathname
	pathname

	Jumps inside the body (ignore #labelname)
	-----------------------------------------
	...pathname#labelname

	Paths in MS DOS
	---------------
	http://host/C:/pathname

	At present there is no code here to handle such
	a poor style.
*/

/*
 * Turn a reference found in a document into a full URL name.
 *
 *   list       - URL list; only passed on to addURL() for the "http:dir/file"
 *                oddity below
 *   currentURL - record of the document (or <BASE HREF>) the reference is
 *                relative to; may be NULL
 *   href       - the reference; must be writable (it is lowercased and
 *                temporarily cut in place)
 *   addflags   - IN-OUT: canonize() may OR IGNORED into it; it is NOT
 *                cleared here
 *
 * Three cases:
 *   - href carries a known scheme (isFullURL()): its path is canonized and
 *     glued back to "scheme://host";
 *   - otherwise, with a currentURL: the path is resolved against the
 *     current document ('?...' replaces the query, '/...' is absolute,
 *     anything else is relative to the current directory) and prefixed with
 *     "http://<host of currentURL>";
 *   - otherwise href is copied as is, with a warning.
 * Finally a "#anchor" (and, when stripOnSemicolon is set, a ";..." tail) in
 * the last path component is stripped.
 *
 * Returns an allocated string; the caller owns it.
 */
char *computeFullName(List *list, URL *currentURL, char *href, int *addflags /* IN-OUT */){
	char *fullName, *s, *from;
	char *rest;
	char *canonicalName;

/*
	Must be done outside this function.
	*addflags = 0;
*/

#ifdef NOTDEF
	while(isspace(*href))
		href++;
#endif

	/* Curing the ill name... ========================= */
	if(ill_urls_flag){
		if( !strncmp(href, HTTP_PREFIX, STRLEN(HTTP_PREFIX))   &&
			     href[STRLEN(HTTP_PREFIX)  ] == '/'        &&
			     href[STRLEN(HTTP_PREFIX)+1] != '/'        &&
			     href[STRLEN(HTTP_PREFIX)+1] != '\0'
		){
			/*  href:/dir/doc
			 *       NOT
			 *  http://host/dir/doc
			 */
			fprintf(fplog, "\t@@@ Strange reference: %s, cutting to: %s\n", href, href + STRLEN(HTTP_PREFIX));
			href += STRLEN(HTTP_PREFIX);
		}
	}
	/* http:dir/file */
	if( !strncmp(href, HTTP_PREFIX, STRLEN(HTTP_PREFIX)) &&
	    href[STRLEN(HTTP_PREFIX)] != '\0'                &&
	    href[STRLEN(HTTP_PREFIX)] != '/'
	){
		fprintf(fplog, "\t@@@ Strange reference: %s, adding: %s\n", href, href + STRLEN(HTTP_PREFIX));

		/* The part after "http:" is additionally queued as a reference of its own */
		addURL(list, href + STRLEN(HTTP_PREFIX), NoFlags);
	}
	/* END Curing the ill name... ===================== */

	if(isFullURL(href, &rest)) {            /* is a FULL NAME of URL */
		char *firstslash;

		/* if "rest" points to hostname, not to http:/pathname */
		if(rest && *rest != '/')
			lowercaseHostName((unsigned char *) rest);

		/*
			---------------------------------
			http://hostname/pathname
			^      ^       ^
			href   rest    firstslash
			---------------------------------
			http:/pathname
			^    ^
			href rest==firstslash
			---------------------------------
			http:filename.html
			^    ^
			href rest
			---------------------------------
		 */

		if(rest && (firstslash = strchr(rest, '/')) != NULL){
			canonicalName = canonize("/", firstslash+1, addflags);

			/* Cut href after the host name while the pieces are joined */
			*firstslash = '\0';
			fullName = strspl(href /* prefix and hostname included */ ,
					  canonicalName,
					  NULL);
			*firstslash = '/';

			free(canonicalName);
		} else
			fullName = strdup(href);
			/* for mailto:xxx@yyy.com -- containing no slashes at all */

	}
	else if(currentURL) {                   /* must construct the full name */

		/* Then http://<currentHost>.....       supposed
		 * <currentHost> will be taken from the currentURL
		 */

		/* parsed earlier at the moment of adding of currentURL */
		char *host = currentURL->hostName;
		char *url  = currentURL->urlName;
		char *thisDir;
		/* "url" has a canoncal form, begins with '/' */

		/* Test for a missing host BEFORE touching it; the fallback is a
		 * string literal and therefore must not be lowercased in place.
		 */
		if(host == NULL){
			fprintf(fplog, "\t!!! Host is undefined (%lu)!!!\n", currentURL->serial);
			host = "nohost";
		} else {
			lowercase((unsigned char*)host);
		}
		if(url == NULL){
			fprintf(fplog, "\t!!! URL is undefined (%lu)!!!\n", currentURL->serial);
			url = "/none.none";
		}
		/* Since we do use current URL, its full name (with stripped hostname)
		 * MUST begin with '/' -- from the root directory !!!
		 */
		if(*url != '/'){
			fprintf(fplog, "\t!!! URL name does not begin with '/' %s (%lu)!!!\n",
				       url, currentURL->serial);

			/* fix this: the repaired name replaces the one stored in currentURL */
			url = strspl("/", url, NULL);
			free(currentURL->urlName);
			currentURL->urlName = url;
		}
		/* ========================================================== */
		switch(*href){
		case '?':	/* PHP case with the HREF='?.....' */
			/* PHP case #1
				2:
				url='/xxx/xxx/xxx?blablablah"
				href="?QQQQ"
				result="/xxx/xxx/xxx?QQQQ"
			 */
			s = strchr(url, '?');
			if(s == NULL){	/* Don't change url */
				canonicalName = strspl(url, href, NULL);
			} else {
				/* Hide the old query while the new one is appended */
				*s = '\0';
				canonicalName = strspl(url, href, NULL);
				*s = '?';
			}
			break;

		case '/':	/* ABSOLUTE PATHNAME */
			canonicalName = canonize("/", href+1, addflags);
			break;

		default:	/* RELATIVE PATHNAME */
			if(isBadPHP(url)){
			/* PHP case #2
				1:
				url='/sections/index.php?article=books/kkm/index.htm'
				href='index.php?article=books/kkm/g1_1.htm'
				result='http://www.synergetic.ru/sections/index.php?article=books/kkm/g1_1.htm'
			 */
				/* Find the last '/' in front of the '?': slashes inside
				 * the query must not be taken for directory separators.
				 */
				s = strchr(url, '?');
				if(s == NULL) s = url;

				for(; url <= s; s--)
					if(*s == '/')
						break;

				if(*s != '/'){
					fprintf(fplog, "\t!!! No / in URL %s\n", url);
					die(666);
				} else {
					/* Cut url right after that '/' for the call only */
					char saved = s[1];
					s[1] = '\0';
					canonicalName = canonize(url, href, addflags);
					s[1] = saved;
				}
				break;	/* switch */
			}
			/* determine the current directory
			   (directory containing the "url")

				/....../dir/        ---> /....../dir/
				/....../dir/xx.html ---> /....../dir/

			 */
			/* I remind again that "url" begins with '/' */
			thisDir = strdup (url);
			s       = strlast(url);

			if(*s == '/')   /* name already is a directory */
				;

			else    {

				/* Stage A: Extract the name of the current directory:
				 * currentURL="http://host/.../parent/thisdir/thisdoc.html"
				 * currentDIR="http://host/.../parent/thisdir/"
				 */
				if((s = strrchr(thisDir, '/')) != NULL)
					s[1] = '\0';

				/* Even in the case thisDir == "/" or "/xxxx"
				 * convert it to "/"
				 */
			}

			/* Here:
				thisDir ends with '/' and begins with '/'.
			 */

			canonicalName = canonize(thisDir, href, addflags);
			free(thisDir);
			break;
		}
		/* canonicalName is a name of url
		   that begins with '/'
		   and don't contain "." or ".." components.
		 */

		fullName = strspl(HTTP_PREFIX, "//", host, canonicalName, NULL);

		free(canonicalName);
		/* ========================================================== */

	} else {
		fprintf(fplog, "\t!!! Warning: unknown URL name format !!!\n");
		fullName  = strdup(href);
	}

	/*
		Strip out the anchor name:

			...host/urlname#label   ->      ...host/urlname
			...host/urlname/#label  ->      ...host/urlname
	*/
	if((from = strrchr(fullName, '/')) != NULL){
		if((s = strrchr(from, '#')) != NULL){
			if(!nowarnflag){
				fprintf(fplog,  "\t*** Warning: %s stripped to ", fullName);
				fprintf(stderr, "\t*** Warning: %s stripped to ", fullName);
				/* to be continued later... */
			}

			*s = '\0';

			/* s lies after "from", so s[-1] is always inside fullName */
			if(s[-1] == '/')        /* ....../#...... */
			   s[-1]  = '\0';       /* ......         */

			if(!nowarnflag){
				fprintf(fplog,  "%s\n", fullName);
				fprintf(stderr, "%s\n", fullName);
			}
		}
	}
	if(stripOnSemicolon && (from = strrchr(fullName, '/')) != NULL){
		if((s = strrchr(from, ';')) != NULL){
			if(!nowarnflag){
				fprintf(fplog,  "\t*** Warning: %s stripped to ", fullName);
				fprintf(stderr, "\t*** Warning: %s stripped to ", fullName);
				/* to be continued later... */
			}

			*s = '\0';

			if(s[-1] == '/')        /* ....../;...... */
			   s[-1]  = '\0';       /* ......         */

			if(!nowarnflag){
				fprintf(fplog,  "%s\n", fullName);
				fprintf(stderr, "%s\n", fullName);
			}
		}
	}
	return fullName;
}

/* ____________________________________________________________________________ */

/* Extract fields "hostName", "urlName", "port" from the "fullName" */

/*
 * Split ptr->fullName into host name, path and port and store them in ptr.
 *
 * - A name without any ':' is replaced by "http://" + name (the old
 *   fullName is freed) and parsed again.
 * - When the text in front of the first ':' is not HTTP (isNotHTTP()) and
 *   the record does not carry BASEHREF, the record is flagged IGNORED and
 *   nothing else is stored.
 * - The host part is lowercased in place inside fullName.
 * - A ":port" suffix is always removed from the host name copy, but
 *   ptr->port is changed only when reassignportsflag is set.
 * - hostName / urlName already present in ptr are kept; the freshly
 *   parsed copies are freed in that case.
 */
void parseName(URL *ptr){
	char *hostname, *urlname;
	char *scol, *sroot, *sport;

	char *name = ptr->fullName;
	/* must be qualified with http:// ftp:// etc */

retry:
	scol = strchr(name, ':');
	/*
		name
		|
		http://hostname/urlpathname
		    |
		    scol
	*/

	if(scol == NULL){
		fprintf(fplog, "\t@@@ Bad URL name format: %s\n", name);

		/* fix this */
		name = strspl(HTTP_PREFIX, "//", name, NULL);
		free(ptr->fullName);
		ptr->fullName = name;

		fprintf(fplog, "\t@@@ Supposing %s\n\n", name);
		goto retry;     /* the new name contains a ':' now */

	} else {
		/* Cut at the colon so that only the scheme is tested */
		*scol = '\0';

		/* Flag BASEHREF may come from the <BASE HREF=...> */
		if(isNotHTTP(name) && !(ptr->flags & BASEHREF)){
			ptr->flags |= IGNORED;

			*scol = ':';    /* restore */
			fprintf(fplog, "\t*** Don't process this kind of URLs: %s\n\n", name);

			return;
		}
		name = scol+1;
		*scol = ':';    /* restore */

		/*
			     name
			     |
			http://hostname/urlpathname
			    |
			    scol
		*/
	}
	/* Skip the slashes between the scheme and the host name */
	while(*name == '/') name++;

	sroot = strchr(name, '/');
	/*
		       name
		       |
		http://hostname/urlpathname
		    |          |
		    scol       sroot
	*/
	if(sroot == NULL){
		lowercase((unsigned char *)name);       /* hostname */
		    hostname = strdup(name);
		    urlname  = strdup("/");             /* HOME PAGE */

		 /* urlname  = strdup("/" INDEXNAME); */


	} else {
		/* Cut at the first '/' while the host name is lowercased and copied */
		*sroot = '\0';
		    lowercase((unsigned char *)name);   /* hostname */
		    hostname = strdup(name);
		*sroot = '/';

		/* Collapse a run of leading slashes: the path starts at the last of them */
		while(*sroot == '/') sroot++;
		sroot--;
		    urlname = strdup(sroot);
	}

	/* hostname may be in the form HOST:port */
	if((sport = strrchr(hostname, ':')) != NULL){
		int nport;

		*sport = '\0';
		if(reassignportsflag){
			/* The port is kept in the form produced by htons() */
			ptr->port = htons(nport = atoi(sport+1));
#ifdef DEBUG
			fprintf(fplog, "\t*** Port number %d is declared for %s\n\n",
							  nport,             ptr->fullName);
#endif
		}

	} /* else ptr->port = HTTPPORT; (has already been done by newURL) */

	/* Now, remember the parsed parts of the "ptr->fullName" in the "ptr". */

	if(ptr->hostName == NULL)
		ptr->hostName = hostname;
	else    free(hostname);

	if(ptr->urlName == NULL)
		ptr->urlName = urlname;
	else    free(urlname);
}

/* ____________________________________________________________________________ */
/*
 * Copy src to dst, writing each space as "%20" and each TAB as "%09".
 * All other characters are copied unchanged; dst is NUL-terminated.
 * dst must have room for up to 3 * strlen(src) + 1 bytes.
 */
void quoteHref(char *dst, char *src){
	static const char hexdigits[] = "0123456789ABCDEF";

	for(; *src != '\0'; src++){
		unsigned char c = (unsigned char) *src;

		if(c == ' ' || c == '\t'){
			/* '%' followed by two upper-case hex digits */
			*dst++ = '%';
			*dst++ = hexdigits[c >> 4];
			*dst++ = hexdigits[c & 0x0F];
		} else {
			*dst++ = *src;
		}
	}
	*dst = '\0';
}
/* ____________________________________________________________________________ */
/*
 * Remove every occurrence of "&quot;;" and "&quot;" from s, in place.
 * At each position the longer form is tried first. After a removal the
 * scan restarts from the beginning, so an entity that only comes into
 * being by joining the text around a removed one is removed as well.
 *
 * The tail is moved with memmove(): source and destination overlap, which
 * strcpy() does not permit.
 */
void dropQuot(char *s){
	char *p = s;

	while(*p != '\0'){
		size_t cut = 0;

		if(strncmp(p, "&quot;;", 7) == 0)
			cut = 7;
		else if(strncmp(p, "&quot;", 6) == 0)
			cut = 6;

		if(cut == 0){
			p++;
			continue;
		}

		/* Close the gap, terminating NUL included */
		memmove(p, p + cut, strlen(p + cut) + 1);
		p = s;          /* rescan from the start */
	}
}
/* ____________________________________________________________________________ */
/*
 * Replace every occurrence of "word" in href by the first character of
 * "word", in place:
 *
 *	......&amp;....	=> .......&.......      (word = "&amp;")
 *
 * After each replacement the scan restarts from the beginning, so
 * occurrences formed by a previous replacement ("&amp;amp;") are
 * collapsed too. Each replacement is logged when "verbose" is set.
 *
 * Words shorter than two characters are ignored: there is nothing to cut,
 * and the scan would never make progress.
 */
void filterWord(char *href, char *word){
	char *p;
	int found;
	size_t wordlen = strlen(word);

	if(wordlen < 2)
		return;

	do {
		found = 0;
		for(p=href; *p; p++){
			if(!strncmp(p, word, wordlen)){
				found++;
				p++;    /* keep the first character of the word */

				/* Overlapping move, NUL included: memmove(), not strcpy() */
				memmove(p, p + wordlen - 1, strlen(p + wordlen - 1) + 1);
				if(verbose)
					fprintf(fplog, "\tFILTER: <<%s>>\n", href);
				break;
			}
		}
	} while(found);
}
/* ____________________________________________________________________________ */

/* Add URL record to the list. Check if it is already there */

/* Incremented by addURL() for every reference that passes name computation
 * and the IGNORED check, whether or not it is already in the list. */
int add_counter;

/* <base href=...> (if any) affects computing of computeFullName() func. */

/*
 * Add the reference "href" to the list unless it is already there.
 *
 *   list     - the URL list; list->currentURL (or the global basehref, when
 *              set) is the document the reference is resolved against
 *   href     - reference text; it is MODIFIED IN PLACE by the optional
 *              clean-up steps (backslashes, &quot;, surrounding spaces,
 *              &amp;)
 *   addflags - flags OR-ed into the new record (usually 0)
 *
 * Returns the new record, or the existing one (its reference counter is
 * incremented) when the full name is already known. Returns NULL when no
 * name could be computed, when the name is empty, when the reference is
 * IGNORED and not FORCED, or when memory is exhausted.
 *
 * Every temporary string allocated here is released on every return path;
 * the full name is owned by the record once newURL() has accepted it.
 */
URL *addURL(List *list, char *href, int addflags /* usually = 0 */){
	char *fullName, *s;
	URL *ptr;
	int href_hacked = 0;	/* apply free() if is set to 1 */

	if(slashflag){
		char *sss;

		for(sss=href; *sss; sss++){
			if(*sss == '\\') *sss = '/';
		}
	}
	if(drop_quot)
		dropQuot(href);

	if(trimURLspaces){
		if(verbose)
			fprintf(fplog, "\t2TRIM: <<%s>>\n", href);
		/* Trim head (<ctype.h> functions need an unsigned char value) */
		while(isspace((unsigned char) *href)) href++;

		/* Trim tail; never steps in front of href, even for "" */
		s = href + strlen(href);
		while(s > href && isspace((unsigned char) s[-1]))
			*--s = '\0';

		if(verbose)
			fprintf(fplog, "\tTRIMD: <<%s>>\n", href);
	}
	if(substitute_amp){
		/* ......&amp;....	=> .......&....... */
		filterWord(href, "&amp;");
	}

	/* Hack against local://filename */
	if(strncasecmp(href, "local:", 6)==0){
		if(verbose)
			fprintf(fplog, "\tDROP local: <<%s>>\n", href);
		href += 6;
		while(*href == '/') href++;
	} else if(strncasecmp(href, "data:", 5)==0){
		if(verbose)
			fprintf(fplog, "\tDROP data: <<%s>>\n", href);
		href += 5;
		while(*href == '/') href++;
	}


	/* We have to use %20 instead of the SPACE in the href="" */
	if(strchr(href, ' ')){
		char *new_href;

		new_href = (char *) calloc(strlen(href)*3 + 1, 1);
		/* 1 char may become 3 ones */
		if(new_href == NULL){
			fprintf(fplog, "\t!!! addURL: out of memory for %s\n", href);
			return NULL;
		}
		quoteHref(new_href, href);
		href = new_href;
		href_hacked++;  /* from now on href is ours to free */
	}

	/* Compute the full name of the new URL */
	/* computeFullName() returns malloc()ed string */

	fullName = computeFullName(list,
				   basehref ? basehref : list->currentURL,
				   href,
				   &addflags /* usually =0 */
				   );
	if(fullName == NULL || !*fullName){
		free(fullName);         /* an empty name is still allocated */
		if(href_hacked) free(href);
		return NULL;
	}

	if(verbose)
		fprintf(fplog, "\t+++ addURL: %s --> %s\n", href, fullName);
/* ??? */
	if((addflags & IGNORED) && !(addflags & FORCED)){
		/* computeFullName() told us that this name is not to process */
		free(fullName);
		if(href_hacked) free(href);
		return NULL;
	}
/* ??? */

	/* Let's think a moment if there can be an empty URL name. Yes, it can:
		HREF="#label12"
		(reference to the other point inside the same document)
		After the striping out '#' it becomes "".
	 */

	if(xxdecodflag >= 2){
		deXXname(href);
		deXXname(fullName);
	}

	add_counter++;  /* count added names */

	/* search in the list if this URL is already there */
	for(ptr=list->head; ptr != NULL; ptr=ptr->next){
#ifdef DEBUG_CMP
		if(verbose) fprintf(fplog, "\t--- Check: %04d '%s', '%s'\n", ptr->serial, fullName, ptr->fullName); /* @ABS@ */
#endif

		if(strcmp(ptr->fullName, fullName) == 0){

			/* yes, already there */
			if(verbose)
				fprintf(fplog, "\t+++ THERE: %s\n", fullName);
			ptr->counter++;
			free(fullName);

			if(href_hacked) free(href);
			return ptr;     /* existing one */
		}
	}

	/* else add new one */
	ptr = newURL(list, list->currentURL, href, fullName);
	if(ptr == NULL){
		/* no record was created, so fullName is still ours */
		free(fullName);
		if(href_hacked) free(href);
		return NULL;
	}

	/* extract parts of the full name
	   and canonize them
	 */
	parseName(ptr);
	if(underflag && ptr == list->head){
		/* The directory of the very first URL becomes the restriction prefix */
		char *ss;
		underdir = strdup(ptr->urlName);
		enXXname(underdir);
		ss = strrchr(underdir, '/');
		if(ss) ss[1] = '\0';

		fprintf(fplog,  "@@@ Restrict paths to directory: %s\n", underdir);
		fprintf(stderr, "@@@ Restrict paths to directory: %s\n", underdir);
	}

	if(skipparents){
		if(isParent(list->currentURL, ptr)){
			fprintf(fplog, "\t@@@ Will be ignored because of href to parent\n");
			addflags |= IGNORED;
		}
	}
	ptr->flags |= addflags;

	(void) skipExisting(list, ptr);

	if(href_hacked) free(href);
	return ptr;
}

/*
 * Decide whether ptr can be skipped because its local copy already exists.
 *
 * Only active when appendflag is set, and only for untouched records other
 * than the list head that have both hostName and urlName. The local file
 * name is computed with computeFileName(). With weak_appendflag or
 * reget_appendflag index documents are never skipped; with weak_appendflag
 * documents with an HTML-like suffix are never skipped either.
 *
 * Returns TRUE (and sets EXISTS in ptr->flags) when the file exists, is
 * not empty and is not a directory; FALSE otherwise.
 */
Bool skipExisting(List *list, URL *ptr){
	char outname[MAXPATHLEN], *postfix = "";
	struct stat st;
	Bool isIndex;

	/* Preconditions: any one failing means "do not skip" */
	if(!appendflag)
		return FALSE;
	if(!isUntouched(ptr))                   /* not a legal document to process */
		return FALSE;
	if(ptr == list->head)                   /* always process the first document */
		return FALSE;
	if(!ptr->hostName || !ptr->urlName)     /* name parts are not defined */
		return FALSE;

	isIndex = computeFileName(outname, ptr->hostName, ptr->urlName, ptr->port, postfix);

	/* reget dirs always */
	if((weak_appendflag || reget_appendflag) && isIndex == TRUE)
		return FALSE;

	/* always reget HTML files (but not others) */
	if(weak_appendflag &&
	   (   strsuffix(ptr->urlName, ".html")
	    || strsuffix(ptr->urlName, ".htm")
	    || strsuffix(ptr->urlName, ".shtml")
	    || strsuffix(ptr->urlName, ".phtml")
	    || strsuffix(ptr->urlName, ".rhtml")
	    || strsuffix(ptr->urlName, ".xml")
	    || strsuffix(ptr->urlName, ".asp")
	    || strsuffix(ptr->urlName, ".php")
	   )
	)
		return FALSE;

	/* Missing, empty or a directory: the document must be fetched */
	if(stat(outname, &st) < 0 || st.st_size <= 0 || isdir(st))
		return FALSE;

	/* The file is there: don't process this URL */
	fprintf(fplog, "\t*** File exists, skip: %s\n\n", ptr->fullName);
	ptr->flags |= EXISTS;

	return TRUE;
}

/* ____________________________________________________________________________ */
/* The interface for the first call from main() */

/*
 * Reset the list to the empty state (no head, no tail, no current URL)
 * and add href as its first entry. Entry point for the first call from
 * main().
 */
void addFirst(List *list, char *href){
	list->head       = NULL;
	list->tail       = NULL;
	list->currentURL = NULL;

	addURL(list, href, NoFlags);
}
/*
 * Read URLs from the text file "filename", one per line, and add each of
 * them to the list.
 *
 * A line of the form
 *	POST http://..... POST data
 * adds the URL with the USEPOST flag; the text after the second " POST "
 * is copied and stored in the record's post_data field. The flag applies
 * to that line only.
 *
 * While a line is being added, list->currentURL is temporarily set to the
 * list tail and restored afterwards. Lines longer than the 16K line buffer
 * are read in several pieces by fgets().
 *
 * A file that cannot be opened is reported with myperror(). Lines for
 * which addURL() yields no record are reported on stderr and skipped.
 */
void loadURLFile(List *list, char *filename){
	FILE *fp;
	char buffer[16 * 1024], *s, *xbuffer, *post_data;
	URL *save, *added;
	int flags;

	if((fp = fopen(filename, "r")) == NULL){
		myperror(filename);
		return;
	}
	while(fgets(buffer, sizeof buffer, fp) != NULL){
		if((s = strchr(buffer, '\n')) != NULL)
			*s = '\0';

		xbuffer = buffer;
		post_data = NULL;
		flags = NoFlags;        /* must not leak from a previous POST line */

		/* The only way to request POST service is from the file */
		/* "POST http://..... POST data" */
		if(!strncmp(buffer, "POST ", 5)){
			flags = USEPOST;
			xbuffer += 5;

			for(s=xbuffer; *s; s++){
				if(!strncmp(s, " POST ", 6)){
					*s = '\0';
					post_data = strdup(s+6);
					break;
				}
			}
		}
		save = list->currentURL;
		list->currentURL = list->tail;

		added = addURL(list, xbuffer, flags);

		list->currentURL = save;

		/* addURL() returns NULL for empty or ignored references */
		if(added == NULL){
			fprintf(stderr, "### Not added: %s\n", xbuffer);
			free(post_data);
			continue;
		}
		fprintf(stderr, "### %06ld Added %s\n", added->serial, added->fullName);

		if(post_data)
			added->post_data = post_data;
	}
	fclose(fp);
}

/* ____________________________________________________________________________ */

/* Get URL from the WWW server                */
/* Simultaneously find all the HREFs included */

/* Path prefixes tested (case-insensitively) by processURL():
 * CGIbin marks CGI scripts, CGIbin_images is exempt from CGI skipping,
 * CGIbin_imagemap marks image-map references. */
char CGIbin         [] = "/cgi-bin/";
char CGIbin_images  [] = "/cgi-bin/images/";
char CGIbin_imagemap[] = "/cgi-bin/imagemap/";

/*
 * Process one URL record: decide whether the document is to be fetched
 * and, if so, fetch it with callHost().
 *
 * Steps, in order:
 *   - records flagged IGNORED are logged and left alone;
 *   - when urlcounter > 0 and the record's serial number exceeds it, the
 *     program is stopped with die(0);
 *   - the hop count is computed and a progress header is logged;
 *   - non-HTTP names, names rejected by checkIfToSkip() (images are exempt
 *     when redimages is set), CGI scripts (with skipcgiflag), hosts
 *     rejected by isLegalHost() and paths outside underdir are marked
 *     instead of fetched;
 *   - an image-map reference additionally queues the mapped file itself;
 *   - otherwise, unless skipExisting() reports an existing local copy,
 *     callHost() is called and its result is OR-ed into ptr->flags.
 *
 * Except for the early returns the record ends up flagged PROCESSED, and
 * the short report is written unless noreport is set.
 */
void processURL(List *list, URL *ptr /* = list->currentURL */ ){

	char *fullName = ptr->fullName;
	/* and parsed parts */
	char *hostname = ptr->hostName;
	char *urlname  = ptr->urlName;

	int rest, to_retry;
	int is_image;

	/* Ignored records are not even marked PROCESSED here */
	if(ptr->flags & IGNORED){
		fprintf(fplog, "\t@@@ Skip this URL: %s\n\n", fullName);
		return;
	}

	/* Optional limit on the number of URLs, by serial number */
	if(urlcounter > 0 && ptr->serial >= urlcounter+1){
		fprintf(fplog,  "### The URL counter exceeded (%lu) -- exitting.\n", urlcounter);
		fprintf(stderr, "### The URL counter exceeded (%lu) -- exitting.\n", urlcounter);

		die(0);
	}

	rest = countRemaining(list, &to_retry);

	setHops(ptr);
	/* Progress header: [serial.level:hops/trys] date [untouched left; to retry] */
	fprintf(fplog, "### %s \"%s\"\n### [%lu.%ld:%d/%ld] %s\n### [%d left; %d to retry]\n",
		ptr->trys > 0 ? "Retrying" : "Processing",
		fullName,
		ptr->serial, ptr->level, ptr->hops, ptr->trys,
		currentDate(),
		rest, to_retry
	);
	if(debugflag > 1) fprintf(stderr, "Processing %s\n", fullName);

	if((ptr->flags & IGNORED) || isNotHTTP(fullName)){
		fprintf(fplog, "\t@@@ Don't process this kind of URLs: %s\n\n", fullName);

		ptr->flags |= PROCESSED; /* processed */
		return;
	}

	is_image = isimage(ptr);
	/* May be always pass images? */

	/* With redimages set, images bypass the checkIfToSkip() filter */
	if(!(redimages && is_image) && checkIfToSkip(ptr) == TRUE){
		ptr->flags |= (PROCESSED|IGNORED); /* processed */
		return;
	}

	if(skiprootdir && (strcmp(urlname, "/") == 0 || strcmp(urlname, "/" INDEXNAME) == 0)){
		fprintf(fplog, "\t@@@ Dont parse HREFs in root index: %s\n\n", fullName);

		ptr->flags |= DONTPARSE;
		/* Get the document itself, but don't follow its HREFs */
	}

	fprintf(fplog, "\tHOST: %s:%u\n", hostname, ntohs(ptr->port));
	fprintf(fplog, "\tURL:  %s\n\n",  urlname);

	/* href=http://host/cgi-bin/imagemap/dir/name.map
		must be really get as the file
		http://host/dir/name.map
	 */
	if(strncasecmp(urlname, CGIbin_imagemap, STRLEN(CGIbin_imagemap)) == 0){
		/* "- 1" keeps the '/' that ends the prefix as the start of the path */
		addURL(list, urlname + STRLEN(CGIbin_imagemap) - 1, FORCED);
		fprintf(fplog, "\t@@@ Adding Image Map: %s\n", urlname + STRLEN(CGIbin_imagemap) - 1);
	}

	/* no "else" here */

	if(skipcgiflag &&
		(strncasecmp(urlname, CGIbin,        STRLEN(CGIbin))        == 0 &&          /* equals */
		 strncasecmp(urlname, CGIbin_images, STRLEN(CGIbin_images)) != 0             /* not equals */
		)
	){
		fprintf(fplog, "\t@@@ Don't process CGI scripts\n");
		ptr->flags |= IGNORED;

	} else  if(isLegalHost (list, ptr) == TRUE ){   /* check hostname = ptr->hostName */

		/* Restriction to the start directory; .txt files, images and
		 * redirection targets are exempt.
		 */
		if(underdir && ptr->urlName &&
			! strsuffix(ptr->urlName, ".txt") &&    /* suffix is NOT .txt */
			! is_image			  &&
			! (ptr->flags & REDIRECTEDTO)     &&    /* is not redirected URL */
			strncmp(ptr->urlName, underdir, strlen(underdir)) != 0   /* differs */
		){
			/* Ignore it */
			ptr->flags |= IGNORED;
			fprintf(fplog, "\t@@@ Ignored, not under %s\n", underdir);
		}

		else

		/* Check again after the possible name change */
		if(skipExisting(list, ptr) == FALSE){    /* don't skip this URL */
			int code = callHost(list, ptr);
			ptr->flags |= code;
			if(humanlike){		/* wait */
				/* Pause of 31..60 seconds between requests */
				int waittime = 60 - rand()%30;
				fprintf(stderr, "Pause %d seconds\n", waittime);
				sleep(waittime);
			}
		}
	} else {
		/* Note: hostname was read before isLegalHost() could replace ptr->hostName */
		fprintf(fplog, "\t@@@ Don't look at host: %s:%u\n", hostname, ntohs(ptr->port));
		ptr->flags |= SKIPPED;
	}
	fprintf(fplog, "\n");

	ptr->flags |= PROCESSED; /* processed */

	if(ptr->flags & RETRY) fprintf(stderr, "### This document will be retried\n\n");

/* DEBUGGING SECTION */
	if(! noreport) reportShortList(list, fpreport);
}

/* ____________________________________________________________________________ */

/*
 * Main driver: keeps picking the next URL to fetch from the list until
 * nothing is left to do in this pass.
 *
 * Every iteration of the outer loop
 *   1. aborts via die() when too many write errors or (if limit404 > 0)
 *      too many 404 answers have been counted;
 *   2. looks for a URL flagged REGET and re-fetches it (at most while its
 *      trys counter is below 2), then starts the iteration over;
 *   3. otherwise processes the first URL that is either flagged RETRY with
 *      fewer than RETRY_IMMEDIATELY tries, or still untouched.
 * The loop ends when step 3 walks off the end of the list.
 *
 * list->currentURL is used as the scan cursor throughout, so it always
 * names the URL handed to processURL().
 */
void mainLoop(List *list){
	int restart;

	count404 = 0;

	for(;;){
		/* Give up when the error budget is exhausted */
		if(error_counter > 5){
			fprintf(fplog,  "### TOO MANY WRITE ERRORS, EXITTING\n");
			fprintf(stderr, "### TOO MANY WRITE ERRORS, EXITTING\n");

			die(0);
		}
		if(limit404 > 0 && count404 > limit404){
			fprintf(fplog,  "### TOO MANY 404 NOT_FOUND ERRORS, EXITTING\n");
			fprintf(stderr, "### TOO MANY 404 NOT_FOUND ERRORS, EXITTING\n");

			die(0);
		}

		/* Step 2: REGET requests */
		restart = 0;
		list->currentURL = list->head;
		while(list->currentURL != NULL){
			if(list->currentURL->flags & REGET){
				if(list->currentURL->trys < 2){         /* retry only once */
					fprintf(fplog,  "### Regetting\n");
					fprintf(stderr, "### Regetting\n");

					setUntouched(list->currentURL);

					/* Try to get !RANDOM or size != 0 */
					processURL(list, list->currentURL);

					/* only once: erase a REGET raised by this very attempt */
					list->currentURL->flags &= ~REGET;

					restart = 1;
					break;
				}
				/* already re-fetched: just drop the request */
				list->currentURL->flags &= ~REGET;
			}
			list->currentURL = list->currentURL->next;
		}
		if(restart)
			continue;       /* re-check the limits and rescan from the head */

		/* Step 3: immediate RETRYs and UNTOUCHED entries */
		list->currentURL = list->head;
		while(list->currentURL != NULL){
			if((list->currentURL->flags & RETRY) && list->currentURL->trys < RETRY_IMMEDIATELY){
				/* retry it immediately */
				setUntouched(list->currentURL);
				processURL(list, list->currentURL);
				break;
			}
			if(isUntouched(list->currentURL)){
				processURL(list, list->currentURL);
				break;
			}
			list->currentURL = list->currentURL->next;
		}

		/* done - the tail of the list is reached */
		if(list->currentURL == NULL)
			break;
	}
}

/*
 * Count the URLs that are still untouched.
 *
 * Returns the number of list entries for which isUntouched() holds.
 * If to_retry is not NULL, the number of entries carrying the RETRY
 * flag is stored through it.
 */
int countRemaining(List *list, int *to_retry){
	int untouched = 0;
	int retries   = 0;
	URL *u        = list->head;

	while(u != NULL){
		if(isUntouched(u))
			untouched++;
		if(u->flags & RETRY)
			retries++;
		u = u->next;
	}

	if(to_retry != NULL)
		*to_retry = retries;

	return untouched;
}

/*
 * Prepare the next pass: every URL flagged RETRY is marked untouched again
 * so that mainLoop() will pick it up, unless it has already used up its
 * MAXTRYS attempts, in which case the RETRY flag is dropped for good.
 *
 * The limit is tested with ">=" rather than "==": trys is also advanced by
 * the immediate-retry and REGET paths of mainLoop(), so a counter that has
 * stepped past MAXTRYS must still be treated as exhausted instead of being
 * retried on every following pass.
 */
void retryNotReceived(List *list){
	URL *ptr;

	fprintf(stderr, "\n### NEXT PASS ###\n\n");
	for(ptr=list->head; ptr; ptr=ptr->next){
		if(!(ptr->flags & RETRY))
			continue;

		if(ptr->trys >= MAXTRYS){
			fprintf(fplog, "@@@ Too many retrials: %s\n", ptr->fullName);
			ptr->flags &= ~RETRY;
			continue;
		}
		setUntouched(ptr);
		/* i.e. ready for retrial */
	}
}
/* ____________________________________________________________________________ */

/*
 * Build the name of the local file a document is stored in.
 *
 *   outname   receives the result; the caller supplies the buffer
 *   hostname  host part of the URL
 *   urlname   path part of the URL
 *   port      port as stored in the URL record (passed through ntohs()
 *             before being printed)
 *   postfix   text appended at the very end of the name
 *
 * The name is rooted at mylogdir when that is set, otherwise at
 * HOME/SPOOLDIR.  ":port" is inserted after the host only when
 * reassignportsflag is set and the port differs from HTTPPORT.
 * When urlname ends with '/', INDEXNAME is appended to it.
 * The finished name is passed through deXXname() and enXXname().
 *
 * Returns TRUE when the name refers to an index (urlname ended with '/'),
 * FALSE otherwise.
 */
Bool computeFileName(char *outname, char *hostname, char *urlname, u_short port, char *postfix){
	char happendx[32];      /* "" or ":port" */
	Bool is_index;

	if(reassignportsflag && HTTPPORT != port)
		sprintf(happendx, ":%d", ntohs(port));
	else
		sprintf(happendx, "");

	is_index = (*strlast(urlname) == '/') ? TRUE : FALSE;

	if(mylogdir){
		if(is_index == TRUE)
			sprintf(outname, "%s/%s%s%s%s%s",
				mylogdir, hostname, happendx, urlname, INDEXNAME, postfix);
		else
			sprintf(outname, "%s/%s%s%s%s",
				mylogdir, hostname, happendx, urlname, postfix);
	} else {
		if(is_index == TRUE)
			sprintf(outname, "%s/%s/%s%s%s%s%s",
				HOME, SPOOLDIR, hostname, happendx, urlname, INDEXNAME, postfix);
		else
			sprintf(outname, "%s/%s/%s%s%s%s",
				HOME, SPOOLDIR, hostname, happendx, urlname, postfix);
	}

	/* same post-processing for every variant */
	deXXname(outname);
	enXXname(outname);

	return is_index;
}

/*
 * Cut the string s at its first '#': the '#' is overwritten with a NUL,
 * so everything from it onwards is dropped.  A string without '#' is
 * left unchanged.  s is modified in place.
 */
void cutRedir(char *s){
	char *hash = strchr(s, '#');

	if(hash == NULL)
		return;

	*hash = '\0';
}

/*
 * Text constants used by processDocument() while it reads the response
 * header.  The header-line prefixes are compared with strncasecmp(), i.e.
 * case-insensitively, and include the blank that follows the field name.
 */
char ContentLength [] = "Content-length: ";         /* declared body size   */
char ContentType   [] = "Content-type: ";           /* media type           */
char HTTPreport    [] = "HTTP/1.0 ";                /* status line prefix   */
char TEXT_HTML     [] = "text/html";                /* parsed for links     */
char TEXT_PLAIN    [] = "text/plain";               /* parsed only if text_as_HTML is set */
char X_DIRECTOR    [] = "application/x-director";   /* also parsed for links */
char Location      [] = "Location: ";               /* redirection target   */

/*
 * Digest one received HTTP response that has been stored, header and body,
 * in the file `filename`.
 *
 *   list      the URL list; new URLs (redirection targets, links found
 *             while parsing) are added to it
 *   urlptr    the URL record the response belongs to; it receives the
 *             declared size, content type, location, return code, flags,
 *             final size and number of references
 *   filename  the file holding the raw response
 *
 * Steps:
 *   1. Read header lines up to the first empty line, picking up
 *      Content-length, Content-type, Location and the status line.
 *   2. For a redirection (HTTP_FOUND / HTTP_MOVED) add the target URL with
 *      the REDIRECTEDTO mark.
 *   3. Unless saving was switched off, copy the body to the file named by
 *      computeFileName(), feeding every byte of an HTML document to
 *      checkChar() so that links are discovered, then verify the size.
 *
 * Returns 0 normally, -1 when `filename` cannot be opened or the output
 * file cannot be created.
 *
 * Fixes relative to the previous version:
 *   - the status line is recognised for any "HTTP/<version> <digits>"
 *     reply, not only for the literal "HTTP/1.0 " (servers commonly answer
 *     "HTTP/1.1 ..." and such codes used to go unnoticed);
 *   - for a relative Location the '#' part is cut off BEFORE the new URL
 *     is composed, so cutRedirs really affects the URL that is added;
 *   - `st` starts zeroed, so it is never read uninitialised when stat()
 *     fails or is not called at all (nullflag);
 *   - sizes are cast to match their printf conversions.
 */
int processDocument(List *list, URL *urlptr, char *filename){
	static const char HTTPprefix[] = "HTTP/";       /* start of any status line */
	FILE *fpin, *fpout;
	State state = HEADER;
	char buffer[10000];
	char outname[MAXPATHLEN * 2], *postfix = "";
	char *s;
	size_t size      = NOSIZE;
	Bool is_html     = FALSE;
	Bool dosave      = TRUE;
	Bool redirection = FALSE;
	int c;
	HttpCode retcode = HTTP_OK;             /* 000 ??? */
	struct stat st   = {0};                 /* zeroed: see isreg(st) and the XX report below */
	size_t nlines = 0L;

	char *hostname = urlptr->hostName;
	char *urlname  = urlptr->urlName;

	if((fpin = fopen(filename, "r")) == NULL){
		fprintf(fplog, "*** Cannot open %s for %s:%s\n", filename, hostname, urlname);
		return (-1);
	}

	/* ---- 1. header ------------------------------------------------- */
	while(fgets(buffer, sizeof buffer, fpin) != NULL){
		/* strip the line terminator (LF and/or CR) */
		if((s = strchr(buffer, '\n')) != NULL) *s = '\0';
		if((s = strchr(buffer, '\r')) != NULL) *s = '\0';
		nlines++;

		if(debugflag >= 3)
			fprintf(fplog, ":::\t%s\n", buffer);

		if(strncasecmp(buffer, ContentLength, STRLEN(ContentLength)) == 0){
			urlptr->declared_size = size = atol(buffer + STRLEN(ContentLength));

			if(debugflag >= 2){
				fprintf(stderr, "\t%s\n", buffer);
				fprintf(fplog,  "\t%s\n", buffer);
			}
		} else
		if(strncasecmp(buffer, ContentType, STRLEN(ContentType)) == 0){
			s = buffer + STRLEN(ContentType);

			urlptr->ctype = strdup(s);

			/* decide whether the body is to be scanned for links */
			     if(strncasecmp(s, TEXT_HTML, STRLEN(TEXT_HTML)) == 0)
				is_html = TRUE;
			else if(strncasecmp(s, X_DIRECTOR, STRLEN(X_DIRECTOR)) == 0)
				is_html = TRUE;
			else if(strsuffix(urlname, ".tmpl"))	/* www.cars.com */
				is_html = TRUE;
			else if(text_as_HTML && strncasecmp(s, TEXT_PLAIN, STRLEN(TEXT_PLAIN)) == 0)
				is_html = TRUE;

			if(debugflag){
				fprintf(stderr, "\t%s\n", buffer);
				fprintf(fplog,  "\t%s\n", buffer);
			}
		} else
		if(strncasecmp(buffer, Location, STRLEN(Location)) == 0){
			s = buffer + STRLEN(Location);

			/* Only remembered here; whether this is a redirection is
			   decided from the HTTP status code, not from the mere
			   presence of this header. */
			urlptr->location = strdup(s);

			if(debugflag){
				fprintf(stderr, "\t%s\n", buffer);
				fprintf(fplog,  "\t%s\n", buffer);
			}

		} else

	/* HTTP/1.0 200 Document Follows
	   HTTP/1.0 301 Moved Permanently
	   HTTP/1.1 404 Not Found
	   HTTP/1.1 302 Found
	 */
		if(strncasecmp(buffer, HTTPprefix, sizeof HTTPprefix - 1) == 0
			&&
		   (s = strchr(buffer, ' ')) != NULL            /* blank after the version */
			&&
		   isdigit((unsigned char) s[1])
		){
			if(debugflag)
				fprintf(stderr, "\t%s\n", buffer);

			retcode = atoi(s + 1);

			if(retcode == HTTP_NOTFOUND)
				count404++;

			if(retcode == HTTP_FOUND || retcode == HTTP_MOVED){
				redirection = TRUE;
				dosave      = FALSE;
			}
			/* Retry if there is a channel problem */
			if(retcode == HTTP_SERVERBAD || retcode == HTTP_GATEWAYBAD ||
			   retcode == HTTP_GATEWAYTIMEOUT || retcode == HTTP_UNKNOWNBAD ||
			   retcode == HTTP_REQTIMEOUT || retcode == HTTP_SERVICEBAD)
				urlptr->flags |= RETRY;

			if(retcode != HTTP_OK){
				if(keepflag){
					postfix = ".ERR";
					/* but dosave = TRUE; */
				} else {
					dosave = FALSE;
					fprintf(fplog, "\t@@@ Don't save it: %s\n", buffer);
				}
				urlptr->flags |= HTTPERROR;
			}
		}

		/* Empty line --> end of the header */
		if(!*buffer){
			state = BODY;
			break;
		}
	}
	urlptr->retcode = retcode;

	urlptr->hrefs = 0;
	add_counter = 0;        /* see addURL() */

	/* ---- 2. redirection -------------------------------------------- */
	if(redirection == TRUE){

		if(redirsaveflag){
			/* keep the redirecting document itself, but do not parse it */
			dosave = TRUE;
			urlptr->flags |= DONTPARSE;
		}

		if(urlptr->location == NULL){

			fprintf(fplog,  "\t@@@ Redirected %s has no redirection URL\n", urlptr->fullName);
			fprintf(stderr, "\t@@@ Redirected %s has no redirection URL\n", urlptr->fullName);

			/* No target given: try the same name as a directory */
			if( *strlast(urlptr->fullName) != '/'){
				URL  *saveptr = list->currentURL;
				char *newname;

				list->currentURL = urlptr;
				newname = strspl(urlptr->fullName, "/", NULL);

				fprintf(fplog,  "\t@@@ Redirected to Index: %s\n", newname);
				fprintf(stderr, "\t@@@ Redirected to Index: %s\n", newname);

				addURL(list, newname, REDIRECTEDTO);
				free(newname);

				list->currentURL = saveptr;
			}

		} else {

			/* 302
			   Location: http://server/dir/doc
			   Location: /dir/doc                   (means http://thisserver/dir/doc)
			   Location: http:/dir/doc              (means http://thisserver/dir/doc)
			 */

			if(isFullURL3(urlptr->location)){
				if(cutRedirs) cutRedir(urlptr->location);
				fprintf(fplog,  "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, urlptr->location);
				fprintf(stderr, "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, urlptr->location);

				urlptr->flags |= REDIRECTED;

				addURL(list, urlptr->location, REDIRECTEDTO);

			} else {
				URL  *saveptr = list->currentURL;
				char *newname;

				/* Location: http:/newdir/newurl */
				if( strncmp(urlptr->location, HTTP_PREFIX, STRLEN(HTTP_PREFIX)) == 0)
					urlptr->location += STRLEN(HTTP_PREFIX);

				/* Cut the '#' part first, so that it is absent from newname too */
				if(cutRedirs) cutRedir(urlptr->location);

				list->currentURL = urlptr;

				if(*urlptr->location == '/')
					newname = strspl(HTTP_PREFIX, "//", urlptr->hostName,      urlptr->location, NULL);
				else	newname = strspl(HTTP_PREFIX, "//", urlptr->hostName, "/", urlptr->location, NULL);

				fprintf(fplog,  "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, newname);
				fprintf(stderr, "\t@@@ Redirection: %s -> %s\n", urlptr->fullName, newname);

				urlptr->flags |= REDIRECTED;

				addURL(list, newname, REDIRECTEDTO);
				free(newname);

				list->currentURL = saveptr;
			}
		}
	}

	/* ---- 3. body ---------------------------------------------------- */

	/* Save the rest into the cache file */
	/* This is the main purpose of the whole this program !!! */
	if(dosave == TRUE){
		char renamed_name[sizeof(outname) + 10];
		Bool renamed;
		Bool parsed = FALSE;

		computeFileName(outname, hostname, urlname, urlptr->port, postfix);

		if(headOnly){
			/* webcp -file HTTPLIST -verifysizes ... */
			/* To verify sizes of the already existing files */
			if(stat(outname, &st) >= 0 && st.st_size == size){
				printf("OK %s\n", urlptr->fullName);
			} else {
				printf("XX %s %lu %lu (%d)\n", urlptr->fullName,
					(unsigned long) st.st_size, (unsigned long) size, (int) st.st_nlink);
			}
			goto out;
		}

		/* Create necessary directories */
		renamed = makepath(outname, redirection, renamed_name);

		if((fpout = fopen(nullflag ? "/dev/null" : outname, "w")) == NULL){
			myperror(outname);
			fclose(fpin);
			return (-1);
		} else {
			fprintf(fplog, "\t*** Saving to: %s\n", outname);
		}

		/* Here do:
		 *      save the URL file.
		 *      if it is HTML file - look for the HREF= in it.
		 */
		resetParser();
		urlptr->flags &= ~NOTPARSED;

		while((c = getc(fpin)) != EOF){

			putc(c, fpout);

			if(ferror(fpout)){
				fprintf(stderr, "### WRITE ERROR: %s\n", outname);
				fprintf(fplog,  "### WRITE ERROR: %s\n", outname);
				myperror("putc");
				error_counter++;

				break;
			}

			if(is_html && (urlptr->flags & DONTPARSE) == 0){
				checkChar(list, c & 0xFF);    /* see parse.c */
				parsed = TRUE;
			}
		}
		fclose(fpout);

		if(parsed == FALSE)
			urlptr->flags |= NOTPARSED;

		/* makepath() moved an older copy aside: keep the old one if nothing changed */
		if(renamed == TRUE && compareFiles(outname, renamed_name) == TRUE){
			fprintf(fplog,  "\t@@@ New and Old are the same; unlinking New\n");
			fprintf(stderr, "\t@@@ New and Old are the same; unlinking New\n");
			unlink(outname);
			if(rename(renamed_name, outname) < 0){
				fprintf(stderr, "### RENAME ERROR: %s %s\n", renamed_name, outname);
				fprintf(fplog,  "### RENAME ERROR: %s %s\n", renamed_name, outname);
				myperror("rename");
			}
		}

		if(nullflag == FALSE){  /* i.e. there IS a real file */

			/* test for the correct size */
			stat(outname, &st);
			if(size == NOSIZE){
				fprintf(fplog, "\t*** Size: %lu (RANDOM)\n", (unsigned long) st.st_size);
				urlptr->flags |= RANDOMSIZE;

				if(st.st_size == 0)     /* probably there was a packet jam */
					urlptr->flags |= REGET;

			} else {
				fprintf(fplog, "\t*** Size: %lu , declared: %lu (%s)\n",
					(unsigned long) st.st_size, (unsigned long) size,
					size == st.st_size ? "OK" : "ERROR"
				);
				urlptr->flags &= ~RANDOMSIZE;

				if(!anysizeflag && size != st.st_size){
					urlptr->flags |= (WRONGSIZE|RETRY);
					fprintf(stderr, "\t!!! Wrong size: %s (expected %lu, got %lu)\n",
									   urlptr->fullName,
									   (unsigned long) size,
									   (unsigned long) st.st_size);
					if(!anysizeflag_save && retryflag){
						fprintf(fplog, "\t@@@ Corrupted file deleted: %s\n",
							       outname);
						unlink(outname);
					}
				}
			}
			urlptr->size = st.st_size;
		}

		/* With nullflag set, st is still all zeroes here */
		if(picsonlyflag && isreg(st) && !isimage(urlptr)){
			unlink(outname);
			fprintf(fplog,  "\t@@@ Unlinking non Picture %s\n", outname);
			fprintf(stderr, "\t@@@ Unlinking non Picture %s\n", outname);
		}

	}
out:
	fclose(fpin);

	urlptr->hrefs = add_counter;
	if(add_counter > 0 && debugflag)
		fprintf(stderr, "\t%d references\n", add_counter);

	basehref = NULL;        /* Reset to "currentURL".
				   "base href=" is local to one document */

	return 0;
}

/* ____________________________________________________________________________ */

/*
 * Write the reports for the list: the short report always goes to
 * fpreport; the long one is additionally written to fplog when
 * debugflag is above 3.
 */
void reportList(List *list){
	int want_long = (debugflag > 3);

	if(want_long){
		reportLongList(list, fplog);
	}
	reportShortList(list, fpreport);
}

/*
 * Print a multi-line description of every URL in the list to fp:
 * return code, level, serial number (and the parent's serial, if any),
 * the full name, the optional host/url/location/type/trys fields, the
 * reference counter and the flags spelled out as words.
 * The entry equal to list->currentURL is marked with a leading "* ".
 */
void reportLongList(List *list, FILE *fp){
	URL *u = list->head;

/* text for a flag when it is set in the current entry, "" otherwise */
#define LONGLIST_FLAG(bit, text)    ((u->flags & (bit)) ? (text) : "")

	fprintf(fp, "---WWW SUMMARY REPORT---------------------------\n\n");

	while(u != NULL){

		if(u == list->currentURL)
			fprintf(fp, "* ");

		fprintf(fp, "%03d %03ld #%lu", u->retcode, u->level, u->serial);
		if(u->parentURL != NULL)
			fprintf(fp, " --> #%lu", u->parentURL->serial);

		fprintf(fp, "\n");
		fprintf(fp, "%s\n", u->fullName);
#ifdef DEBUG
		fprintf(fp, "%s\n", u->shortName);
#endif
		/* optional fields: printed only when known */
		if(u->hostName) fprintf(fp, "\thost:\t%s\n", u->hostName);
		if(u->urlName)  fprintf(fp, "\turl:\t%s\n",  u->urlName);
		if(u->location) fprintf(fp, "\tloc:\t%s\n",  u->location);
		if(u->ctype)    fprintf(fp, "\ttype:\t%s\n", u->ctype);
		if(u->trys > 0) fprintf(fp, "\ttrys:\t%ld\n", u->trys);

		fprintf(fp, "\tcount:\t%ld\n", u->counter);
		fprintf(fp, "\tflags:\t%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
			LONGLIST_FLAG(CONNECTED,    "connected "),
			LONGLIST_FLAG(FAILED,       "failed "),
			LONGLIST_FLAG(SKIPPED,      "skipped "),
			LONGLIST_FLAG(IGNORED,      "ignored "),
			LONGLIST_FLAG(EXISTS,       "exists "),
			LONGLIST_FLAG(UNKNOWN,      "unknown "),
			LONGLIST_FLAG(WRONGSIZE,    "badsize "),
			LONGLIST_FLAG(HTTPERROR,    "HTTPfail "),
			LONGLIST_FLAG(RETRY,        "retry "),
			LONGLIST_FLAG(DONTPARSE,    "dontparse "),
			LONGLIST_FLAG(NOTPARSED,    "notparsed "),
			LONGLIST_FLAG(RANDOMSIZE,   "random "),
			LONGLIST_FLAG(BASEHREF,     "base "),
			LONGLIST_FLAG(REDIRECTEDTO, "redir ")
			/* PROCESSED ("seen ") is deliberately not reported */
		);
		fprintf(fp, "\n");

		u = u->next;
	}

#undef LONGLIST_FLAG
}

/*
 * Rewrite the one-line-per-URL report in fp from its beginning
 * (fp is rewound first and flushed at the end).
 *
 * Each line starts with six one-character columns:
 *   result    's' HTTP_OK, '-' return code 0, '>' HTTP_FOUND, 'f' otherwise
 *   status    first flag set among RETRY 'R', IGNORED 'I', SKIPPED 'S',
 *             EXISTS 'E', REDIRECTED '>', CONNECTED 'c', UNKNOWN 'u'; else '-'
 *   failed    'f' when FAILED is set, else '-'
 *   sizemark  RANDOMSIZE '~', WRONGSIZE '%', HTTPERROR '+',
 *             CONNECTED with size 0 '@'; else '-'
 *   parsemark DONTPARSE 'D', NOTPARSED 'N'; else '-'
 *   origin    REDIRECTEDTO '#', BASEHREF '$'; else '='
 * followed by return code, level, trys, size, declared size, hrefs,
 * serial, parent's serial (0 when there is no parent), the full name and,
 * when known, the content type in brackets.
 */
void reportShortList(List *list, FILE *fp){
	URL *u;

	rewind(fp);
	for(u = list->head; u != NULL; u = u->next){
		char result, status, failed, sizemark, parsemark, origin;

		if(u->retcode == HTTP_OK)               result = 's';   /* Success    */
		else if(u->retcode == 0)                result = '-';   /* Not tried  */
		else if(u->retcode == HTTP_FOUND)       result = '>';   /* Redirected */
		else                                    result = 'f';   /* HTTP request Failed */

		if(u->flags & RETRY)                    status = 'R';
		else if(u->flags & IGNORED)             status = 'I';
		else if(u->flags & SKIPPED)             status = 'S';
		else if(u->flags & EXISTS)              status = 'E';
		else if(u->flags & REDIRECTED)          status = '>';
		else if(u->flags & CONNECTED)           status = 'c';
		else if(u->flags & UNKNOWN)             status = 'u';
		else                                    status = '-';

		failed = (u->flags & FAILED) ? 'f' : '-';

		if(u->flags & RANDOMSIZE)               sizemark = '~';
		else if(u->flags & WRONGSIZE)           sizemark = '%';
		else if(u->flags & HTTPERROR)           sizemark = '+';
		else if((u->flags & CONNECTED) && (u->size == 0))
							sizemark = '@';
		else                                    sizemark = '-';

		if(u->flags & DONTPARSE)                parsemark = 'D';
		else if(u->flags & NOTPARSED)           parsemark = 'N';
		else                                    parsemark = '-';

		if(u->flags & REDIRECTEDTO)             origin = '#';
		else if(u->flags & BASEHREF)            origin = '$';
		else                                    origin = '=';

		fprintf(fp,
			"%c %c%c%c%c %c%03d %03ld %0ld %7lu %7lu %03ld %06lu %06lu %s",
			result,
			status, failed, sizemark, parsemark,
			origin,
			u->retcode,
			u->level,
			u->trys,
			u->size,
			u->declared_size,
			u->hrefs,
			u->serial,
			u->parentURL ? u->parentURL->serial : 0,
			u->fullName);

		if(u->ctype != NULL)
			fprintf(fp, " [%s]\n", u->ctype);
		else
			fprintf(fp, "\n");
	}
	fflush(fp);
}

/* ____________________________________________________________________________ */

/*
 * Filter lists.  All of them are singly linked lists of Item, initially
 * empty; the add*() functions push new entries at the head.
 */
Item *skip_list    = NULL;              /* filled by addSkipItem(), scanned by checkIfToSkip() */
Item *match_list[2]  = { NULL, NULL };  /* "redlist" 0 and 1: filled by addMatchPattern()      */
Item *nmatch_list[2] = { NULL, NULL };  /* "blacklist" 0 and 1: filled by addNMatchPattern()   */
Item *allowed_host = NULL;              /* filled by addAllowedHost(), tested by hostAllowed() */
Item *denied_host  = NULL;              /* filled by addDeniedHost(), tested by hostDenied()   */

void addSkipItem(char *s, int flag){
	Item *newptr;

	newptr = (Item *) calloc(1, sizeof(Item));

	newptr->text     = strdup(s);
	newptr->length   = strlen(s);
	newptr->flags    = flag;
	newptr->is_a_dir = (*strlast(s) == '/' ? TRUE : FALSE);
	/*      /...../dir/     or
		/...../doc.html
	 */

	newptr->next = skip_list;
	skip_list    = newptr;
}
void addMatchPattern(char *s, int set){
	Item *newptr;

	newptr = (Item *) calloc(1, sizeof(Item));

	newptr->text     = strdup(s);
	newptr->length   = strlen(s);
	newptr->flags    = 0;
	newptr->is_a_dir = FALSE;

	newptr->next		= match_list[set];
	match_list[set]		= newptr;
}
void addNMatchPattern(char *s, int set){
	Item *newptr;

	newptr = (Item *) calloc(1, sizeof(Item));

	newptr->text     = strdup(s);
	newptr->length   = strlen(s);
	newptr->flags    = 0;
	newptr->is_a_dir = FALSE;

	newptr->next		= nmatch_list[set];
	nmatch_list[set]	= newptr;
}
/*
 * Put a copy of host pattern s in front of the allowed_host list
 * (tested by hostAllowed()).
 */
void addAllowedHost(char *s){
	Item *entry;

	entry = (Item *) calloc(1, sizeof(Item));

	entry->text     = strdup(s);
	entry->length   = strlen(s);
	entry->is_a_dir = FALSE;
	entry->flags    = 0;

	/* new entries go first */
	entry->next  = allowed_host;
	allowed_host = entry;
}
/*
 * Put a copy of host pattern s in front of the denied_host list
 * (tested by hostDenied()).
 */
void addDeniedHost(char *s){
	Item *entry;

	entry = (Item *) calloc(1, sizeof(Item));

	entry->text     = strdup(s);
	entry->length   = strlen(s);
	entry->is_a_dir = FALSE;
	entry->flags    = 0;

	/* new entries go first */
	entry->next = denied_host;
	denied_host = entry;
}
/*
 * TRUE when name is matched, via match(), by at least one pattern of
 * the allowed_host list; FALSE otherwise (also when the list is empty).
 */
Bool hostAllowed(char *name){
	Item *entry = allowed_host;

	while(entry != NULL){
		if(match(name, entry->text))
			return TRUE;
		entry = entry->next;
	}
	return FALSE;
}
/*
 * TRUE when name is matched, via match(), by at least one pattern of
 * the denied_host list; FALSE otherwise (also when the list is empty).
 */
Bool hostDenied(char *name){
	Item *entry = denied_host;

	while(entry != NULL){
		if(match(name, entry->text))
			return TRUE;
		entry = entry->next;
	}
	return FALSE;
}

/*
 * Decide whether a URL has to be left out, judging by its urlName.
 *
 * Checks, in this order (the pattern lists are not applied to the first
 * URL of workList):
 *   1. blacklist 0 (nmatch_list[0]): a match means skip;
 *   2. redlist 0   (match_list[0]):  if the list is not empty, the name
 *                                    must match one of its patterns;
 *   3. blacklist 1 (nmatch_list[1]): as 1;
 *   4. redlist 1   (match_list[1]):  as 2;
 *   5. skip_list: the first entry that fits the name decides, depending
 *      on the entry's own flags.
 *
 * returns:
 *
 *	TRUE            -       flag IGNORED has been set;   skip the URL
 *
 *	FALSE           -       flag DONTPARSE has been set; continue
 *			-       no flags changed;            continue
 */
Bool checkIfToSkip(URL *urlptr){
	Item *ptr;
	int   is_first = (urlptr == workList.head);

	/*
	 * Blacklist[0] - skip some items always, like "/forum*"
	 */
	if(nmatch_list[0] != NULL && !is_first){
		ptr = nmatch_list[0];
		while(ptr != NULL){
			if(match(urlptr->urlName, ptr->text) != 0){
				fprintf(fplog, "\t@@@ Skip: \"%s\" matches blacklist0 \"%s\"\n\n",
					urlptr->urlName, ptr->text);

				urlptr->flags |= IGNORED;
				return TRUE;
			}
			ptr = ptr->next;
		}
	}

	/*
	 * Not excluded by blacklist[0]:
	 * now pass only what is named in redlist[0]
	 */
	if(match_list[0] != NULL && !is_first){
		for(ptr = match_list[0]; ptr != NULL; ptr = ptr->next)
			if(match(urlptr->urlName, ptr->text) != 0)
				break;

		if(ptr == NULL){
			fprintf(fplog, "\t@@@ Skip: \"%s\" no match in redlist0\n\n", urlptr->urlName);
			urlptr->flags |= IGNORED;
			return TRUE;
		}
		fprintf(fplog, "\t@@@ Pass: \"%s\" matches redlist0 \"%s\"\n\n", urlptr->urlName, ptr->text);
	}

	/*
	 * Blacklist[1] - skip some items
	 */
	if(nmatch_list[1] != NULL && !is_first){
		ptr = nmatch_list[1];
		while(ptr != NULL){
			if(match(urlptr->urlName, ptr->text) != 0){
				fprintf(fplog, "\t@@@ Skip: \"%s\" matches blacklist1 \"%s\"\n\n",
					urlptr->urlName, ptr->text);

				urlptr->flags |= IGNORED;
				return TRUE;
			}
			ptr = ptr->next;
		}
	}

	/*
	 * Not excluded by blacklist[1]:
	 * now pass only what is named in redlist[1]
	 */
	if(match_list[1] != NULL && !is_first){
		for(ptr = match_list[1]; ptr != NULL; ptr = ptr->next)
			if(match(urlptr->urlName, ptr->text) != 0)
				break;

		if(ptr == NULL){
			fprintf(fplog, "\t@@@ Skip: \"%s\" no match in redlist1\n\n", urlptr->urlName);
			urlptr->flags |= IGNORED;
			return TRUE;
		}
		fprintf(fplog, "\t@@@ Pass: \"%s\" matches redlist1 \"%s\"\n\n", urlptr->urlName, ptr->text);
	}

	/*----------------------------------------------*/

	/* Find the first skip_list entry that fits the name */
	for(ptr = skip_list; ptr != NULL; ptr = ptr->next){
		char *name = urlptr->urlName;

		/* THISONLY: only an exact match of a dir name counts */
		if(ptr->flags & THISONLY){
			if(ptr->is_a_dir == TRUE && strcmp(name, ptr->text) == 0)
				break;
			continue;
		}

		/*
			text:           /AAA/
			urlName:        /AAA/.......
		*/
		if(ptr->is_a_dir == TRUE &&
		   strncmp(name, ptr->text, ptr->length) == 0)
			break;

		/*
			text:           /AAA/BBB.html
			urlName:        /AAA/BBB.html           or
			urlName:        /AAA/BBB.html#......
		*/
		if(ptr->is_a_dir == FALSE &&
		   (strcmp(name, ptr->text) == 0 ||
		    (strncmp(name, ptr->text, ptr->length) == 0 &&
		     name[ptr->length] == '#')))
			break;
	}

	if(ptr == NULL)
		return FALSE;           /* nothing in skip_list fits */

	/* The entry's own flags tell what to do with the URL */
	if(ptr->flags & IGNORED){
		fprintf(fplog, "\t@@@ Skipping %s under %s\n\n", urlptr->urlName, ptr->text);
		urlptr->flags |= IGNORED;
		return TRUE;
	}
	if(ptr->flags & DONTPARSE){
		fprintf(fplog, "\t@@@ Don't parse HREFs in %s under %s\n\n", urlptr->urlName, ptr->text);
		urlptr->flags |= DONTPARSE;
	}
	return FALSE;
}
/* ____________________________________________________________________________ */

/*
 * Tell whether ptr refers to a directory lying strictly above the
 * directory of current.
 *
 *   current = "/aaa/bbb/ccc/"   or   "/aaa/bbb/ccc/ddd.html"
 *
 *   TRUE  for   ptr = "/aaa/bbb/", "/aaa/", "/"
 *   FALSE for   ptr = "/aaa/bbb/ccc/"      (the directory itself)
 *               ptr = "/aaa/bbb/xxx....."  (not a prefix / not a directory)
 *
 * FALSE is also returned when either argument or its urlName is NULL,
 * when ptr's name is empty, and when current's name contains no '/'
 * at all (it then has no directory part to be above of).
 *
 * The directory of current is everything up to and including its last
 * '/'; it is only measured here, current->urlName is never written to.
 */
Bool isParent(URL *current, URL *ptr){
	char   *currentName;
	char   *ptrName;
	char   *slash;
	size_t  dirlen;         /* length of current's directory part, with the '/' */
	size_t  length;         /* length of ptr's name */

	if(current == NULL || ptr == NULL)
		return FALSE;

	currentName = current->urlName;
	ptrName     = ptr->urlName;

	if(currentName == NULL || ptrName == NULL)
		return FALSE;

	slash = strrchr(currentName, '/');
	if(slash == NULL)
		return FALSE;
	dirlen = (size_t) (slash - currentName) + 1;

	length = strlen(ptrName);
	if(length == 0)
		return FALSE;

	if(
	   /* reference to the directory */
	   *strlast(ptrName) == '/'                      &&

	   /* current's directory is LONGER than ptrName */
	   length < dirlen                               &&

	   /* ptrName is a beginning (prefix) of currentName */
	   strncmp(ptrName, currentName, length) == 0    &&

	   /* a doubled slash right after the prefix does not count */
	   currentName[length] != '/'

	)  return TRUE;

	return FALSE;
}

/*
 * Change %HH to the hex char itself, in place.
 *
 * Nothing is done when xxdecodflag is FALSE or href contains no '%'.
 * A '%' that is not followed by two hex digits is copied as it is.
 * "%00" is copied as it is too: decoding it would put a NUL in the middle
 * of the name and silently cut off everything behind it.
 *
 * The result is never longer than the original, so writing it back into
 * href is safe.  Every changed name is reported in fplog.
 * If no memory is available for the working copy, href is left untouched.
 */
void deXXname(char *href){
	char *tmphref;
	char *s, *p;
	int changed = 0;

	if(xxdecodflag == FALSE || strchr(href, '%') == NULL)
		return;

	tmphref = strdup(href);
	if(tmphref == NULL)
		return;

	for(s=href, p=tmphref; *s;){
		if(s[0] == '%' && isxdigit((unsigned char) s[1])
			       && isxdigit((unsigned char) s[2])){
			char numb[3];
			unsigned int val = 0;   /* %X stores an unsigned int */

			numb[0] = s[1];
			numb[1] = s[2];
			numb[2] = '\0';

			if(sscanf(numb, "%X", &val) == 1 && val != 0){
				*p++ = (char) (val & 0xFF);
				s += 3;
				changed++;
				continue;
			}
			/* "%00" or a failed conversion: fall through, copy literally */
		}
		*p++ = *s++;
	}
	*p = '\0';

	if(changed){
		fprintf(fplog, "\t@@@ HREF decoded: %s to %s\n", href, tmphref);
		strcpy(href, tmphref);
	}
	free(tmphref);
}
/*
 * For names like http://books.rusf.ru/unzip/add-on/xussr_gk/eskovk25.htm?5/5 :
 * replace, in place, every '/' that follows the first '?' by a backslash.
 *
 * Nothing is done when xxencodflag is FALSE or href contains no '?'.
 */
void enXXname(char *href){
	char *q;

	if(xxencodflag == FALSE)
		return;

	q = strchr(href, '?');
	if(q == NULL)
		return;

	/* everything before the first '?' stays as it is */
	for(q++; *q != '\0'; q++){
		if(*q == '/')
			*q = '\\';
	}
}
