/*
 * c.eval
 *
 * Evaluate if a url we've found in a page should be fetched or not.
 *
 * $Log: eval,v $
 * Revision 1.12  1998/08/30 18:31:48  joseph
 * Fetching of inline images may be specified more precisely
 *
 * Revision 1.11  1998/08/30 13:09:58  joseph
 * Fetching ordered is now more defined - pages closer to the root, and
 * actual pages (as opposed to inlines) at the same line level are favoured.
 *
 * Revision 1.10  1998/08/10 19:51:01  joseph
 * Removed some of the debugging
 *
 * Revision 1.9  1998/08/10 19:21:08  joseph
 * Added following / rewriting of <meta http-equiv="REFRESH"...> tag
 *
 * Revision 1.8  1998/08/09 15:06:59  joseph
 * Added include/exclude lists for URLs
 *
 * Revision 1.7  1998/06/16 18:29:38  joseph
 * Added recalculation of fetch portion for base href & redirections
 * Added processing of base href in fetch and rewrite
 * Moved file_truncate from rewrite to file
 * Logging of urls with unknown schemes
 * Errorlog file kept open across whole fetch
 * Log routines may handle effects of *close better.
 * Invalid chars in urls and hostnames replaced with ~'s on filesystem
 * Rewrite recovers from zero length log file
 * Removed permfail, softfail states as not used
 * Removed some unused bits from url structures
 * Changed proxy code to handle dotless hostnames okay.
 * Version to 0.06a.
 *
 * Revision 1.6  1998/06/14 21:24:37  joseph
 * Moved relative url resolver from rewrite / http -> misc and rewrote
 * Fixed rewrite leaving []'s all over log file.
 * Fixed misc_urltofilename barfing on url's like http://wiggle
 * Lessened processor time when fetching pages with _lots_ of links in them.
 * Make FetchError open/close at start/end of fetch like FetchLog
 * Fixed handling of ftp urls, with & without proxy set.
 * Fixed bugs in code to set fetchportion
 * URLs now moved to fetch list when fetched (whether successful or not)
 * Log file now says '[http]' or '[ftp]' for real link, as appropriate.
 * Recalculates fetch portion for redirected urls
 * Fixed handling of proxy for ftp (wasn't always using proxy)
 * url.c now uses 'times 2' allocator / deallocator.
 * url_handled corrected for ftp (returns true only if proxy set)
 * closes log file when fetch aborted.
 *
 * Revision 1.5  1998/06/06 21:44:55  joseph
 * Added ability to not fetch frames / inline images in fetch / config
 * wimpc_menuread moved to wimpclib
 * Now fades individual fetches when started, unfades all at end of fetch
 *
 * Revision 1.4  1998/05/28 19:41:10  joseph
 * Changed all printf()'s to debug_printf(())'s
 *
 * Revision 1.3  1998/04/05 16:11:41  jogu
 * Fixed up http_softerror() a bit, added processing of <frame src="..">
 *
 * Revision 1.2  1998/04/05 11:33:26  jogu
 * Added fetch portion, changed sense of linkdepth (counts down now)
 *
 * Revision 1.1  1998/03/21 20:30:04  jogu
 * Added evaluation of url's.
 *
 *
 */

#include <stdio.h> /* printf */
#include <string.h>

#include "my_string.h"
#include "debug.h"
#include "fetchstruc.h"
#include "http.h"
#include "misc.h"
#include "ruleset.h"

#include "eval.h"

/* Table of fetch records, defined elsewhere; eval_url() indexes it with its
 * 'no' argument. */
extern httpfetch_t *httpfetch[];


/* Forward declaration: eval_url() calls this before its definition further
 * down this file. */
static int eval_include_exclude( const char *include, const char *exclude, char *url );


static int eval_checkfetchp( const char *tofetch, const char *url, const char *base, int *fetchp )
{
  int basefetchp;

  if ( *fetchp == 0 )
    return 1; /* fetch all URLs */

  /* user only wants certain urls fetching */

  /* if we have a base href, we check both the original url and the base href.
   * We must check base href, as this is where our links are going
   * We must check orig url in case people put inline images with abs. urls
   */
  if ( my_strncaseequ( tofetch, url, *fetchp ) )
  {
    /* main url matches, fetch url */
    return 1;
  }

  if ( !base )
    return 0; /* not BASE HREF, url can't match... */

  basefetchp = http_recalcfetchportion( base, url, *fetchp );
  if ( !my_strncaseequ( tofetch, base, basefetchp ) )
    return 0; /* doesn't match BASE HREF fetchp */

  /* matches - fetch url with mods :) */
  *fetchp = basefetchp;
  return 1;
}

/*
 * To evaluate whether a url we want to fetch is under a host or under a tree
 * that we want to fetch, we record the portion of the url that all others must
 * contain, as an integer offset of the last character we want to match.
 * We need to remember to do something very sensible about this when we follow
 * a redirection.
 */

/* eval_url
 *
 * Decide whether the link 'tofetch', found in the page held in
 * httpfetch[no], should be fetched; if so, add it to the PENDING list.
 *
 *  no         - index into httpfetch[] of the page the link was found in
 *  tofetch    - the url under consideration
 *  inlinetype - how the link appeared in the page:
 *                 inline_image       rejected if the fetch has noinlineimages
 *                                    set, or if the url is outside the
 *                                    image fetch portion
 *                 inline_frame       rejected unless the fetch has
 *                                    inlineframes set
 *                 inline_metarefresh always accepted at this stage
 *                 anything else      an ordinary link: rejected when no link
 *                                    depth remains or the url is outside the
 *                                    fetch portion; otherwise the remaining
 *                                    depth is decremented and the actual
 *                                    depth incremented for the new entry
 *
 * A url that passes the above is then checked against the include/exclude
 * lists of the fetch's ruleset (if it has one).  If the ruleset cannot be
 * looked up, a debug message is emitted and the url is still fetched.
 *
 * Returns 1 if the url was queued, 0 if it was rejected.
 *
 * The third parameter used to be called 'inline', which is a reserved word
 * from C99 onwards; parameter names are not part of a C function's type, so
 * existing prototypes and callers are unaffected by the rename.
 */
int eval_url( int no, char *tofetch, eval_inline inlinetype )
{
  httpfetch_t *x = httpfetch[no]; /* NB. Don't pass x to flex & co */
  /* the page's url string is stored immediately after the fetch record */
  const char *url = ((char *) x) + sizeof(httpfetch_t);
  int newdepth = x->linkdepthtogo;
  int newactdepth = x->act_linkdepth;
  int newfetchp = x->fetchportion;
  int newimagefetchp = x->imagefetchportion;

  if ( inlinetype == inline_frame )
  {
    if ( !x->inlineframes )
      return 0; /* don't fetch - user doesn't want inline frames */
  }
  else if ( inlinetype == inline_metarefresh )
  {
    /* fetch always */
  }
  else /* inline_image or inline_not: both are limited by a fetch portion */
  {
    const char *base;
    int *fetchp;

    if ( inlinetype == inline_image )
    {
      if ( x->noinlineimages )
        return 0; /* don't fetch - user doesn't want inline images */
      fetchp = &newimagefetchp;
    }
    else /* inline_not */
    {
      if ( newdepth == 0 )
        return 0; /* don't fetch, too deep */
      newdepth--;
      newactdepth++;
      fetchp = &newfetchp;
    }

    /* when the page has a base href, it follows the url's terminator */
    base = x->baseurl ? url + strlen( url ) + 1 : NULL;

    if ( !eval_checkfetchp( tofetch, url, base, fetchp ) )
      return 0; /* doesn't match the relevant fetch portion */
  }

  if ( x->ruleset != -1 )
  {
    rule_t *rule = ruleset_recall( x->ruleset );
    if ( rule )
    {
      if ( ! eval_include_exclude( rule->includefiles, rule->excludefiles, tofetch ) )
        return 0; /* url rejected by the ruleset */
    }
    else
    {
      /* ruleset missing: report it, but carry on and fetch the url */
      debug_printf(("%d : error looking up ruleset %d\n", no, x->ruleset ));
    }
  }

  /* Otherwise, the url can be fetched */
  url_addto( PENDING, tofetch, newdepth, newfetchp, x->noinlineimages, x->inlineframes,
             x->ruleset, inlinetype, newactdepth, newimagefetchp );

  return 1; /* fetch */
}



/* eval_wildcardmatch
 *
 * Returns 1 if the whole of string q matches pattern p, 0 otherwise.
 *
 *  p         - pattern; '*' matches any sequence of characters (including
 *              none) and '?' matches exactly one character.  The pattern
 *              ends at a NUL or at the first 'extraterm' character,
 *              whichever comes first.
 *  q         - NUL terminated string to test
 *  extraterm - additional terminator applied to p only (pass 0 for none)
 *
 * The match is done iteratively: on a mismatch we return to the most recent
 * '*' and let it swallow one more character of q.  This keeps the work
 * bounded by length(p) * length(q) and uses no recursion.  Runs of adjacent
 * '*' behave exactly like a single '*', so "a**" matches "a".
 */
static int eval_wildcardmatch( const char *p,const char *q, char extraterm )
{
  const char *retry_p = NULL; /* pattern position just after the last '*' */
  const char *retry_q = NULL; /* where in q that '*' currently stops */

  for (;;)
  {
    int p_done = ( *p == 0 || *p == extraterm );

    if ( p_done && *q == 0 )
      return 1; /* end of p & q reached */

    if ( *p == '*' )
    {
      p++;
      if ( *p == 0 || *p == extraterm )
        return 1; /* trailing '*' matches whatever is left of q */
      /* remember where to resume from if the rest fails to match */
      retry_p = p;
      retry_q = q;
      continue;
    }

    if ( !p_done && *q != 0 && ( *p == '?' || *p == *q ) )
    {
      /* '?' matches any one character, anything else must match exactly */
      p++, q++;
      continue;
    }

    /* Mismatch.  If q is exhausted the pattern still needs characters that
     * are not there; if no '*' has been seen there is nothing to retry. */
    if ( *q == 0 || retry_p == NULL )
      return 0;

    /* let the last '*' absorb one more character and try again
     * (retry_q never passes q, and *q != 0 here, so this stays in bounds) */
    p = retry_p;
    q = ++retry_q;
  }
}

/* Include / exclude lists are space separated; EXTRATERM is that separator
 * and is handed to eval_wildcardmatch() as the extra pattern terminator.
 */
#define EXTRATERM ' '

/* eval_listmatch
 *
 * Returns 1 if 'name' matches any wildcard pattern in the EXTRATERM
 * separated 'list', 0 if it matches none.  'list' must not be NULL.
 */
static int eval_listmatch( const char *list, const char *name )
{
  const char *entry;
  const char *sep;

  for ( entry = list; ; entry = sep + 1 /* step over the separator */ )
  {
    if ( eval_wildcardmatch( entry, name, EXTRATERM ) )
      return 1;

    sep = strchr( entry, EXTRATERM );
    if ( sep == NULL )
      return 0; /* that was the last entry */
  }
}

/* eval_include_exclude
 *
 * Checks a url against the include and exclude lists of a ruleset.  The
 * complete url string is matched against each pattern (no filename is
 * extracted from it).
 *
 * Returns :
 *  1 -> url matches an include pattern and no exclude pattern; fetch it
 *  0 -> url matches no include pattern, or matches an exclude pattern
 *
 * The exclude list is only consulted once the include list has matched.
 * Neither list may be NULL.
 */
static int eval_include_exclude( const char *include, const char *exclude, char *url )
{
  if ( !eval_listmatch( include, url ) )
    return 0; /* not covered by any include pattern */

  if ( eval_listmatch( exclude, url ) )
    return 0; /* explicitly excluded */

  return 1;
}
