/* misc.c
 *
 * Misc routines for url handling
 *
 * $Log: misc,v $
 * Revision 1.16  2000/02/11 18:56:22  joseph
 * Fixed bug that caused url schemes to resolve incorrectly.
 *
 * Revision 1.15  1999/08/22 12:32:29  joseph
 * Fix for people doing relative urls, but beginning them with a scheme
 *
 * Revision 1.14  1998/12/05 15:46:38  joseph
 * Fixed so that control chars are removed from absolute urls too
 * Converts scheme in urls to lower case
 *
 * Revision 1.13  1998/12/05 13:24:14  joseph
 * Fixed problem parsing <meta http-equiv=fresh url=...> tags
 *
 * Revision 1.12  1998/10/15 20:14:35  joseph
 * Now ignores control characters inside urls
 *
 * Revision 1.11  1998/10/04 12:04:43  joseph
 * Fixed parser to cope with <a href = "..."> (spaces around =)
 *
 * Revision 1.10  1998/08/23 19:26:02  joseph
 * URL prefix is now configurable
 * 'Cannot resolve' error message now include hostname
 * Bug with <a\nhref="..."> fixed
 *
 * Revision 1.9  1998/08/18 21:20:28  joseph
 * Transferred logic for scanning url for tag to misc and made it more
 * tolerant of broken html (missing "s, etc)
 *
 * Revision 1.8  1998/08/12 21:12:25  joseph
 * Fixed for // mapping to .. (an invalid RiscOS filename)
 * Really allowed - and _ in filenames generated from the path section of
 * a URL.
 *
 * Revision 1.7  1998/08/09 16:29:53  joseph
 * Fix for URLs of form http://host not getting saved in correct place.
 *
 * Revision 1.6  1998/08/09 15:43:28  joseph
 * Download filenames may now include - and _
 *
 * Revision 1.5  1998/08/09 15:06:59  joseph
 * Added include/exclude lists for URLs
 *
 * Revision 1.4  1998/07/01 21:10:57  joseph
 * Added status window
 * Fixed persistent connections
 * Fixed magtags to only count processed url references
 * Fixed proxy_forunqualified, again.
 * Fixed leading/trailing spaces in urls...
 *
 * Revision 1.3  1998/06/16 18:29:38  joseph
 * Added recalculation of fetch portion for base href & redirections
 * Added processing of base href in fetch and rewrite
 * Moved file_truncate from rewrite to file
 * Logging of urls with unknown schemes
 * Errorlog file kept open across whole fetch
 * Log routines may handle effects of *close better.
 * Invalid chars in urls and hostnames replaced with ~'s on filesystem
 * Rewrite recovers from zero length log file
 * Removed permfail, softfail states as not used
 * Removed some unused bits from url structures
 * Changed proxy code to handle dotless hostnames okay.
 * Version to 0.06a.
 *
 * Revision 1.2  1998/06/14 21:24:37  joseph
 * Moved relative url resolver from rewrite / http -> misc and rewrote
 * Fixed rewrite leaving []'s all over log file.
 * Fixed misc_urltofilename barfing on url's like http://wiggle
 * Lessened processor time when fetching pages with _lots_ of links in them.
 * Make FetchError open/close at start/end of fetch like FetchLog
 * Fixed handling of ftp urls, with & without proxy set.
 * Fixed bugs in code to set fetchportion
 * URLs now moved to fetch list when fetched (whether sucessful or not)
 * Log file now says '[http]' or '[ftp]' for real link, as appropriate.
 * Recalculates fetch portion for redirected urls
 * Fixed handling of proxy for ftp (wasn't always using proxy)
 * url.c now uses 'times 2' allocator / deallocator.
 * url_handled corrected for ftp (returns true only if proxy set)
 * closes log file when fetch aborted.
 *
 *
 */
#include <string.h>
#include <ctype.h>

#include "defines.h"
#include "swis.h"
#include "debug.h"
#include "url.h"
#include "macros.h"
#include "my_string.h"

#include "misc.h"

#ifndef TEST_CODE




/* misc_scantag()
 *
 * Looks inside one HTML tag for a URL we care about and returns a pointer
 * to it, or NULL if the tag holds nothing of interest.
 *
 * On entry:
 *   tag      - NUL terminated tag text, starting at the element name
 *              (eg. "A HREF=\"x.html\""). It is MODIFIED: a NUL is written
 *              over the character that follows the url.
 *   len      - length of tag; only used when matching the element name
 *   basehref - set to 1 if the tag is a BASE tag, otherwise left untouched
 *   inline   - receives inline_not, inline_image (IMG, BODY),
 *              inline_frame (FRAME) or inline_metarefresh (META refresh)
 *   lastchar - if not NULL, receives the character that was overwritten by
 *              the terminating NUL
 *
 * Tags / attributes recognised:
 *   BASE HREF, IMG SRC, BODY BACKGROUND, FRAME SRC, A HREF, AREA HREF and
 *   META HTTP-EQUIV=REFRESH CONTENT="n; URL=..."
 *
 * NOTE: 'inline' is a keyword from C99 onwards, so this parameter name only
 * compiles with a C89 compiler.
 */

char *misc_scantag( char *tag, int len, int *basehref, eval_inline *inline, char *lastchar )
{
  int skip = 0;            /* length of the attribute name to step over; 0 for META */
  char *lookfrom = NULL;   /* where the attribute name (later, the url) starts */
  char *urlend;

  *inline = inline_not;

  debug_printf(("Searching '%s' for a tag.\n", tag ));
/* checks that a string 'a' of length 'b' starts with 'c' followed by a white space character */
#define my_tagstarts(a,b,c) ( (((b) >= (sizeof(c)-1)) ? my_strncaseequ((a),(c),sizeof(c)-1) : 0) && isspace((unsigned char)(a)[sizeof(c)-1]) )

  if ( my_tagstarts(tag,len,"BASE") )
  {
    debug_printf(("Starts BASE\n"));
    lookfrom = my_strcasestr(tag, "HREF");
    *basehref = 1;
    skip = sizeof("HREF") - 1;
  }
  else if ( my_tagstarts(tag,len, "IMG") )
  {
    debug_printf(("Starts IMG\n"));
    /* look for 'SRC' */
    lookfrom = my_strcasestr(tag, "SRC");
    *inline = inline_image;
    skip = sizeof("SRC") - 1;
  }
  else if ( my_tagstarts(tag,len, "BODY") )
  {
    debug_printf(("Starts BODY\n"));
    lookfrom = my_strcasestr(tag, "BACKGROUND");
    *inline = inline_image;
    skip = sizeof("BACKGROUND") - 1;
  }
  else if ( my_tagstarts(tag,len, "META") )
  {
    char *ptr;
    debug_printf(("Starts META\n"));
    ptr = my_strcasestr(tag, "HTTP-EQUIV" );
    if ( !ptr )
      return NULL;

    /* now look for '=' 'refresh' */
    ptr += sizeof("HTTP-EQUIV") - 1;
    while ( isspace( (unsigned char) *ptr ) ) ptr++;
    if ( *ptr++ != '=' )   /* step over the '=' so the skipping below really happens */
      return NULL;

    while ( isspace( (unsigned char) *ptr ) ) ptr++;
    if ( *ptr == '"' )
    {
      ptr++;
      while ( isspace( (unsigned char) *ptr ) ) ptr++;
    }
    if ( ! my_strcasestr(ptr, "REFRESH" ) )
      return NULL;
    lookfrom = tag; /* never used, but stops the code returning */
    *inline = inline_metarefresh;
  }
  else if ( my_tagstarts(tag,len, "FRAME") )
  {
    debug_printf(("Starts FRAME\n"));
    /* look for 'SRC' */
    lookfrom = my_strcasestr(tag, "SRC");
    *inline = inline_frame;
    skip = sizeof("SRC") - 1;
  }
  else if ( my_tagstarts(tag,len,"A") )
  {
    debug_printf(("Starts A\n"));
    /* look for 'HREF', cos people do things like <A NAME="xxx" HREF="dgfg"> */
    lookfrom = my_strcasestr(tag, "HREF");
    skip = sizeof("HREF") - 1;
  }
  else if ( my_tagstarts(tag,len,"AREA") )
  {
    /* <MAP><AREA SHAPE="RECT" COORDS="252,0,296,19" HREF="...."></MAP> */
    debug_printf(("Starts AREA\n"));
    lookfrom = my_strcasestr(tag, "HREF");
    skip = sizeof("HREF") - 1;
  }
  else return NULL;

  if ( !lookfrom )
    return NULL; /* right sort of tag, but the attribute we want is missing */

  if ( skip )
  {
    /* look for the =, skipping over any white space. Skip over any white space after the =, too. */
    lookfrom += skip;
    while ( isspace( (unsigned char) *lookfrom ) )
      lookfrom++;
    if ( *lookfrom++ != '=' )
      return NULL;
    while ( isspace( (unsigned char) *lookfrom ) )
      lookfrom++;
  }
  debug_printf(("Found '%s'\n",lookfrom));

  if ( *inline == inline_metarefresh )
  {
    char *ptr;
    /* tag of form <meta http-equiv="REFRESH" content="12; URL=indexpoo.html"> */
    ptr = my_strcasestr( tag, "content=" );
    if ( !ptr )
      return NULL;
    ptr = my_strcasestr( ptr, "URL=" );
    if ( !ptr )
      return NULL;
    debug_printf(("After finding URL=, we have ptr = '%s'\n",ptr));
    ptr += 4; /* skip URL= */
    lookfrom = ptr;
  }

  /* either lookfrom now points at a ", or it doesn't :) */
  if ( *lookfrom == '\"' )
  {
    urlend = strchr( ++lookfrom, '\"' );
    if ( !urlend )
      return NULL; /* opening " without a closing one */
  }
  else
  {
    /* hohum. they've missed at least the start ". The url ends at whichever
       comes first of white space, a " or the end of the tag. (Taking the
       first of these, rather than preferring a space, stops a stray closing
       quote being returned as part of the url when more attributes follow.) */
    urlend = lookfrom + strcspn( lookfrom, " \t\r\n\f\v\"" );
  }

  if ( lastchar ) *lastchar = *urlend;
  *urlend = 0;

  debug_printf(("Returning '%s'\n",lookfrom ));

  return lookfrom;
}




/* misc_urltofilename()
 *
 * Builds, in 'name' (a buffer of 'size' bytes), the filename under SITES_DIR
 * that the given url is stored at.
 *
 * PLAN:
 *   Start with the canonicalised SITES_DIR followed by '.'
 *   If scheme isn't http, add 'scheme/.' ( / illegal in hostnames )
 *   Add hostname, converting . -> _ ( _ is illegal in hostnames ) and : -> !
 *   Add path, converting / -> . and . -> /
 *   (an empty path segment, ie. //, becomes ~ so we never generate '..')
 *   If the url terminates /, add index/html (TODO: should be configurable)
 *   Characters other than letters, digits, - and _ become ~; control
 *   characters are dropped.
 *
 * Returns:
 *     0 : success, name holds the filename
 *    -1 : failure (SITES_DIR unreadable, no scheme, no //, or name too long)
 *   -10 : url_handled() returned non-zero for this url. I do mean -10. This
 *         makes rewrite write out the url unchanged.
 *
 * 'name' may have been partly written even when an error is returned.
 */
int misc_urltofilename( const char *url, char *name, const int size )
{
  static const char default_path[] = "/";          /* used when the url has no path at all */
  static const char index_leaf[]   = "index/html"; /* leafname for urls ending in / */
  const char *ptr;
  char *out;
  char *end = name + size - 1; /* last poss. position for null terminator */
  size_t schemelen;

  {
    int spare;
    if ( _swix(OS_FSControl, _INR(0,2) | _IN(5) | _OUT(5), 37, SITES_DIR, name, size - 1, &spare) || spare<20 )
    {
      debug_printf(("Could not read SITES_DIR(or too long)!\n"));
      return -1;
    }
  }
  while ( *url == ' ' ) url++; /* fix for <img src=" http..."> problem */

  strcat(name,".");
  ptr = strchr(url,':');
  if (!ptr)
  {
    debug_printf(("No :, '%s'\n",url));
    return -1; /* permanent fail - could not determine scheme! */
  }

  if ( url_handled( url ) != 0 )
  {
    debug_printf(("Unhandled scheme, '%s'\n",url));
    return -10; /* I do mean -10. This makes rewrite write out the url unchanged */
  }

  /* Compare the whole scheme: a bare memcmp over the scheme length would
     accept any prefix of "http" and read past the literal for long schemes. */
  schemelen = (size_t) ( ptr - url );
  if ( schemelen != 4 || memcmp( url, "http", 4 ) != 0 )
  {
    /* not http! - prefix with 'scheme/.', provided it fits */
    if ( strlen( name ) + schemelen + 2 > (size_t) ( end - name ) )
    {
      debug_printf(("Filename too long, '%s'\n",url));
      return -1; /* perm. - filename generated is too long */
    }
    strncat(name, url, schemelen);
    strcat(name, "/.");
  }

  /* now add hostname.. */
  out = name + strlen(name); /* point to null */
  ptr = strstr(url,"//");
  if (!ptr)
  {
    debug_printf(("No //, '%s'\n",url));
    return -1; /* perm. - url does not contain a // */
  }
  ptr+=2; /* points to first char of hostname */

  while ( *ptr != '/' && *ptr && out < end )
  {
    unsigned char c = (unsigned char) *ptr; /* ctype functions need an unsigned char value */
    switch ( c )
    {
      case '.':
        *out++ = '_';
        break;
      case ':':
        *out++ = '!';
        break;
      default:
        if ( isalnum( c ) || c=='_' || c=='-' )
          *out++ = (char) c;
        else if ( c >= 32 )
          *out++ = '~';
        /* control characters are silently dropped */
        break;
    }
    ptr++;
  }
  if (out>=end)
  {
    debug_printf(("Filename too long, '%s'\n",url));
    return -1; /* perm. - filename generated is too long */
  }

  *out=0;
  if ( !*ptr )
    /* humbug. damn missing paths... */
    ptr = default_path;

  /* now copy path... beginning at / */
  while ( *ptr && out < end )
  {
    unsigned char c = (unsigned char) *ptr;
    switch ( c )
    {
      case '.':
        *out++ = '/';
        break;
      case '/':
        /* '//' in the url would give '..' - an invalid RiscOS filename */
        if ( *(out-1) == '.' )
          *out++ = '~';
        else
          *out++ = '.';
        break;
      default:
        if ( isalnum( c ) || c=='_' || c=='-' )
          *out++ = (char) c;
        else if ( c >= 32 )
          *out++ = '~';
        break;
    }
    ptr++;
  }
  if (out>=end)
  {
    debug_printf(("Filename too long, '%s'\n",url));
    return -1; /* perm. - filename generated is too long */
  }
  *out=0;

  if ( *(out-1) == '.')
  {
    /* url ended in / - add the default leafname, but only if all of it fits
       in what is left of 'name' (a truncated leafname would be wrong). */
    if ( (size_t) ( end - out ) < sizeof(index_leaf) - 1 )
    {
      debug_printf(("Filename too long, '%s'\n",url));
      return -1; /* perm. - filename generated is too long */
    }
    strcpy( out, index_leaf );
  }

  return 0;
}


#endif

/* Resolving a reference against a url (see misc_resolverelativeurl):
   if the reference begins http: / ftp: / etc, it's absolute.
   if the reference begins /, it's on this server, but from the root.
   while it begins ../ strip up a '/'
   otherwise, it's relative, so add it to the current url
   (after stripping the current url back to its final '/')
 */

/* Examples:

  URL                     Relative ref       Resolves to
  http://blue/blob/	  /		     http://blue/
  http://blue/blob/	  ../		     http://blue/
  http://blue/blob/	  index.html	     http://blue/index.html
  http://blue/index.html  ../		     http://blue/
  http://blue/index.html  moo.html	     http://blue/moo.html
  http://blue/index.html  /moo.html	     http://blue/moo.html
*/

/*
 * Notes on misc_resolverelativeurl():
 *
 * The code follows RFC1808, Section 4 : Resolving relative URLs
 * NB. resultsize is in characters - ie. sizeof(result)-1
 * Note : Our parsing of the base url does not follow the rfc.
 *
 * Returns:
 *   NULL     : error
 *   non-NULL : pointer to the resolved url
 *
 * Full URL format is :
 * scheme : // netloc / rel_path ; params ? query # fragment
 *
 */





/* misc_urltostring()
 *
 * Reassembles a parsed url into 'result' as
 *   scheme :// netloc path [;parameters] [?query] [#fragment]
 *
 * resultsize is in characters - ie. sizeof(result)-1.
 *
 * Returns:
 *   -1 : scheme or netloc missing, or the url would not fit
 *    0 : successful
 */
static int misc_urltostring( url_parsed *u, char *result, int resultsize )
{
  /* total length of the six components, without any separators */
  int needed = u->scheme.l + u->netloc.l + u->path.l + u->parameters.l + u->query.l + u->fragment.l;

  /* a url is no use to us without both a scheme and a hostname */
  if ( u->scheme.s == NULL || u->netloc.s == NULL )
    return -1;

  /* allow for the separators: "://" plus ';', '?' and '#' */
  if ( needed + 6 > resultsize )
    return -1;

  /* mandatory part: scheme://netloc */
  result[0] = '\0';
  strncat( result, u->scheme.s, u->scheme.l );
  strcat( result, "://" );
  strncat( result, u->netloc.s, u->netloc.l );

  /* each remaining component is only written if it is present */
  if ( u->path.s != NULL )
  {
    strncat( result, u->path.s, u->path.l );
  }

  if ( u->parameters.s != NULL )
  {
    strcat( result, ";" );
    strncat( result, u->parameters.s, u->parameters.l );
  }

  if ( u->query.s != NULL )
  {
    strcat( result, "?" );
    strncat( result, u->query.s, u->query.l );
  }

  if ( u->fragment.s != NULL )
  {
    strcat( result, "#" );
    strncat( result, u->fragment.s, u->fragment.l );
  }

  return 0;
}

/* misc_parseurl
 *
 * This code follows the method described in RFC1808, Section '2.4.  Parsing a URL'
 *
 * On entry only u->url needs to be set; the rest of the structure is cleared.
 * On exit each of scheme, netloc, path, parameters, query and fragment holds
 * a pointer (.s) into the original string and a length (.l). Nothing is
 * copied and the components are NOT null terminated. A component that is
 * absent or empty has .s == NULL and .l == 0.
 *
 * Side effect: the scheme, if there is one, is converted to lower case in
 * place, so u->url must point at writable memory.
 */
void misc_parseurl( url_parsed *u )
{
  char *url    = u->url;
  char *urlend = url + strlen(url); /* one after last valid char in url not discarded */
  char *ptr;   /* temporary variable */

  memset( u, 0, sizeof(url_parsed) ); /* clear out structure */
  u->url = url;

  /* fragment : everything after the first '#' */
  ptr = strchr( url, '#' );
  if ( ptr )
  {
    urlend = ptr; /* the rest of the parse stops at the '#' */
    u->fragment.s = ptr + 1;
    u->fragment.l = strlen( ptr + 1 );
  }

  /* check for scheme */
  for ( ptr = url; isscheme(*ptr) && ptr < urlend; ptr++ )
    ;
  if ( ptr != url && *ptr == ':' )
  {
    char *x;
    for ( x = url; x < ptr; x++ )
      *x = (char) tolower( (unsigned char) *x ); /* lower case scheme */
    u->scheme.s = url;
    u->scheme.l = ptr - url;
    url         = ptr + 1;
  }

  /* netloc : from after '//' up to, but not including, the next '/' */
  if ( url[0] == '/' && url[1] == '/' )
  {
    u->netloc.s = url + 2;
    ptr = strchr( u->netloc.s, '/' );
    if ( ptr )
      url = ptr; /* point to / at end of netloc */
    else
      url += strlen( url ); /* point to null */
    if ( url > urlend ) url = urlend; /* the '/' (or end) we found was inside the fragment */
    u->netloc.l = url - u->netloc.s;
  }

  /* query : from the first '?' to the end of what is left.
     Test for NULL before comparing - relational comparison of a null
     pointer is undefined. */
  ptr = strchr( url, '?' );
  if ( ptr && ptr < urlend )
  {
    u->query.s = ptr + 1;            /* don't include '?' */
    u->query.l = urlend - ptr - 1;
    urlend = ptr;                    /* leave urlend pointing at the '?' */
  }

  /* parameters : from the first ';' to the end of what is left */
  ptr = strchr( url, ';' );
  if ( ptr && ptr < urlend )
  {
    u->parameters.s = ptr + 1;       /* don't include ';' */
    u->parameters.l = urlend - ptr - 1;
    urlend = ptr;
  }

  /* whatever remains is the path */
  u->path.s = url;
  u->path.l = urlend - url;

  /* an empty component is reported as a missing one */
  #define CHECK( element ) if ( element.l == 0 ) element.s = NULL;
  CHECK( u->scheme );
  CHECK( u->path );
  CHECK( u->netloc );
  CHECK( u->fragment );
  CHECK( u->query );
  CHECK( u->parameters );
  #undef CHECK

}

/* misc_resolverelativeurl()
 *
 * Resolves embeddedurl (a possibly relative url found inside a document)
 * against baseurl (the url of that document) and writes the absolute url
 * into result.
 *
 * This code follows RFC1808, Section 4 : Resolving relative URLs, except:
 *  - Step 2 b) is deliberately skipped: an embedded url with a scheme but
 *    no netloc (eg. http:/ruffle.html) is still resolved against the base.
 *  - any "/../" that cannot be resolved is simply removed.
 * NB. resultsize is in characters - ie. sizeof(result)-1
 * Note : Our parsing of the base url does not follow the rfc.
 *
 * Both baseurl and embeddedurl are MODIFIED in place: misc_parseurl lower
 * cases their schemes, and control characters are removed from embeddedurl.
 * Both must therefore be writable strings.
 *
 * Returns:
 *   NULL        : error (result would not fit, or no scheme / netloc)
 *   embeddedurl : baseurl was empty
 *   baseurl     : embeddedurl was empty
 *   result      : successful, result holds the resolved url
 *
 * Full URL format is :
 * scheme : // netloc / rel_path ; params ? query # fragment
 *
 */

char *misc_resolverelativeurl( char *baseurl, char *embeddedurl, char *result, int resultsize )
{
  url_parsed base, embedded;
  char path[URL_MAXLEN], *ptr;
  size_t pathlen;

  base.url = baseurl;
  embedded.url = embeddedurl;

  while ( isspace( (unsigned char) *embedded.url ) ) embedded.url++; /* skip over spaces in <img src=" <url>"> */

  /* remove any control codes in the url! This has to happen before the url
     is parsed, because parsing records pointers and lengths within the
     string which would all be wrong once characters had been removed. */
  {
    char *keep = embedded.url;
    for ( ptr = embedded.url; *ptr; ptr++ )
      if ( (unsigned char) *ptr >= 32 )
        *keep++ = *ptr;
    *keep = 0;
  }

  misc_parseurl( &base );
  misc_parseurl( &embedded );

  /* Step 1 */
  if ( !*baseurl )
    return embeddedurl;

  /* Step 2 a) */
  if ( !*embeddedurl )
    return baseurl;

  /* Step 2 b) */
/*  if ( embedded.scheme.l != 0 )
      return embeddedurl; */ /* Doing this stops http:/ruffle.html working */

  /* Step 2 c) */
  if ( embedded.scheme.l == 0 )
    embedded.scheme = base.scheme;

  /* Step 3 */
  if ( embedded.netloc.l != 0 )
    goto step7;
  embedded.netloc = base.netloc;

  /* Step 4 */
  if ( embedded.path.s && embedded.path.s[0] == '/' )
    goto step7; /* absolute path already in embedded url */

  /* Step 5 */
  if ( embedded.path.l == 0 )
  {
    /* No path in embedded url */
    embedded.path = base.path;
    /* Step 5 a) */
    if ( embedded.parameters.l != 0 )
      goto step7;

    embedded.parameters = base.parameters;

    /* Step 5 b) */
    if ( embedded.query.l != 0 )
      goto step7;

    embedded.query = base.query;
    goto step7;
  }

  /* Step 6 - relative path in embedded url */
  if ( base.path.l )
  {
    *path = 0;
    if ( base.path.l > sizeof(path) - 1 )
      return NULL; /* path too big */
    strncat( path, base.path.s, base.path.l );
  }
  else
  {
    strcpy( path, "/" );
  }
  ptr = strrchr( path, '/' );
  if ( ptr )
    ptr[1] = 0; /* remove anything after the / */
  /* Append path from embedded url */
  if ( embedded.path.l + strlen( path ) + 1 > sizeof(path) )
    return NULL; /* would overflow */
  strncat( path, embedded.path.s, embedded.path.l );

  /* Step 6 a) - remove all occurrences of "./" where "." is a complete path segment */
  while ( ptr = strstr( path, "/./" ), ptr )
    memmove( ptr, ptr + 2, strlen( ptr + 2 ) + 1 );

  /* Step 6 b) - if path ends with "." as a complete path segment, remove '.' */
  pathlen = strlen( path );
  if ( pathlen >= 2 && path[pathlen - 1] == '.' && path[pathlen - 2] == '/' )
    path[ pathlen - 1 ] = 0;

  /* Step 6 c) - all occurences of <segment>/../ where <segment>!=".." are removed */
  ptr = path + 1; /* missing initial /, as there can't be a segment before it... */
  while ( ptr = strstr( ptr, "/../" ), ptr )
  {
    char *prevstroke = ptr - 1;
    while ( *prevstroke != '/' && prevstroke > path ) prevstroke--; /* find start of previous segment */
    if ( strncmp( prevstroke, "/../", 4 ) != 0 )
    {
      /* previous segment isn't /../ - remove it*/
      memmove( prevstroke + 1, ptr + 4, strlen( ptr + 4 ) + 1 );
      ptr = prevstroke; /* move to where the point we've searched upto has ended up */
      if ( ptr <= path ) ptr = path + 1; /* don't end up searching before start of string! */
    }
    else
      ptr += 3; /* skip over /.. */
  }

  /* Step 6 d) - if path ends with "segment/.." where segment is a complete path segment != "..", remove it */
  pathlen = strlen( path );
  if ( pathlen >= 3 && strcmp( path + pathlen - 3, "/.." ) == 0 )
  {
    if ( pathlen == 3 )
    {
      /* path is exactly "/.." - there is no previous segment to look at
         (and looking for one would read before the start of path), so
         just drop the ".." and stay at the root. */
      path[1] = 0;
    }
    else
    {
      char *prevstroke = path + pathlen - 4; /* just before / of /.. */
      while ( *prevstroke != '/' && prevstroke > path ) prevstroke--; /* find start of previous segment */
      if ( strncmp( prevstroke, "/../", 4 ) != 0 )
        /* previous segment isn't /../ - remove it*/
        prevstroke[1] = 0; /* truncate after previous / */
    }
  }

  /* Note: this goes totally against the RFC, but we now remove all /../'s left. */
  ptr = path;
  while ( ptr = strstr( ptr, "/../"), ptr )
    memmove( ptr, ptr + 3, strlen( ptr + 3 ) + 1 );

  /* only now is the path final, so only now record its length */
  embedded.path.s = path;
  embedded.path.l = strlen( path );

  step7: /* reform url */

  if ( misc_urltostring( &embedded, result, resultsize ) < 0 )
    return NULL;

  return result;
}


#ifdef TEST_CODE

#include <stdio.h>

/* Test harness (built only when TEST_CODE is defined).
 *
 * Reads the file named on the command line. Each line has the form
 *   <embedded url> TAB <expected result>
 * Lines without a TAB are ignored. Every embedded url is resolved against
 * the base url http://a/b/c/d;p?q#f and the result compared with the
 * expected one. Testing stops at the first failure or mismatch.
 *
 * Returns 0 if every line matched, 1 on a usage error, an unreadable file
 * or a failed test.
 */
int main( int argc, char *argv[] )
{
  FILE *f;
  char buffer[160];
  char result[1024];
  int status = 0;

  if ( argc != 2 )
  {
    printf("Syntax : misc <filename>\n");
    return 1;
  }

  f = fopen( argv[1], "r" );

  if ( !f )
  {
    printf("Could not open file '%s'\n",argv[1]);
    return 1;
  }

  while ( status == 0 && fgets( buffer, sizeof(buffer)-1, f ) )
  {
    /* An array, not a string literal: resolving lower cases the scheme of
       the base url in place, so it has to be writable. */
    char baseurl[] = "http://a/b/c/d;p?q#f";
    char *expected = strchr( buffer , '\t' );
    char *cr = strchr( buffer, '\n' );
    char *resolved;

    if ( !expected ) continue;
    if ( cr ) *cr = 0;

    *expected++ = 0; /* split the line into input and expected output */

    printf( "Input           = '%s'\n", buffer );
    resolved = misc_resolverelativeurl( baseurl, buffer, result, sizeof(result)-1 );
    if ( !resolved )
    {
      printf( "Output          = [failed]\n" );
      printf( "Expected Output = '%s'\n\n\n", expected );
      status = 1;
    }
    else
    {
      int same = ( strcmp( resolved, expected ) == 0 );

      printf( "Output          = '%s'\n", resolved );
      printf( "Match           = '%s'\n", same ? "**Yes**" : "__No__" );
      if ( !same )
      {
        printf( "Expected Output = '%s'\n\n\n", expected );
        status = 1;
      }
    }

  }

  fclose( f ); /* reached on every path, including after a failed test */

  return status;
}

#endif
