/*
 * pipe - output HTML/XML in canonical ("sgmls") form
 *
 * Parse HTML/XML and output in approximate "nsgmls" format. Some of
 * the differences are that comments are also printed (see * below),
 * that implied attributes are not, and that entities are left
 * unexpanded. Use "unent" to expand entities to UTF-8.
 *
 * The program doesn't interpret the source in any way, and doesn't
 * read DTDs. That means that, e.g., end tags are not automatically
 * added. Pipe the source through normalize(1) first in order to
 * convert HTML to XML and infer missing tags.
 *
 * The possible command characters and arguments are as follows:
 *
 *     (gi
 *
 *	    The start of an element whose generic identifier is gi.
 *	    Any attributes for this element will have been specified
 *	    with A commands.
 *
 *     )gi
 *
 *	    The end of an element whose generic identifier is gi.
 *
 *     |gi
 *
 *	    An empty element (an element whose tag in the source ended
 *	    with a slash). Any attributes will have been specified
 *	    with A commands. (Note that this distinguishes empty
 *	    elements from elements that happen to have no content,
 *	    even though XML doesn't.)
 *
 *     -data
 *
 *	    Data.
 *
 *     ?pi
 *
 *	    A processing instruction with data pi.
 *
 *     *comment
 *
 *	    A comment
 *
 *     Aname type val
 *
 *	    The next element to start has an attribute name with value
 *	    val and type type. Implied attributes are not shown. All
 *	    attributes are assumed to be of type CDATA, because pipe
 *	    doesn't read DTDs. The exceptions are "xml:id" and
 *	    "xmlid", which are assumed to be of type TOKEN.
 *
 *     !root "fpi" url
 *     !root "fpi"
 *     !root "" url
 *
 *	    A document type declaration. The fpi (public identifier)
 *	    is a quoted string. If there is no fpi, the string is
 *	    empty: "". If there is no url, it is omitted.
 *
 *     Llineno
 *
 *	    Set the current line number. This will be output only if
 *	    the -l option has been given.
 *
 * Part of HTML-XML-utils, see:
 * http://www.w3.org/Tools/HTML-XML-utils/
 *
 * Copyright © 1994-2012 World Wide Web Consortium
 * See http://www.w3.org/Consortium/Legal/copyright-software
 *
 * Author: Bert Bos
 * Created: 2 Dec 1998
 * Version: $Id: hxpipe.c,v 1.14 2025/02/16 01:54:10 bbos Exp $
 *
 **/
#include "config.h"
#include <stdio.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#include <ctype.h>
#if STDC_HEADERS
# include <string.h>
#else
# ifndef HAVE_STRCHR
#  define strchr index
#  define strrchr rindex
# endif
#endif
#include <stdlib.h>
#include <assert.h>
#include <err.h>
#include <sysexits.h>
#include <stdbool.h>
#include "export.h"
#include "heap.e"
#include "types.e"
#include "tree.e"
#include "html.e"
#include "scan.e"
#include "dict.e"
#include "openurl.e"

/* Attribute name made of the XML namespace URI in braces followed by "id".
 * print_attrs() reports an attribute with this name (as well as "xmlid" and
 * "xml:id") as type TOKEN instead of CDATA. */
#define XMLID "{http://www.w3.org/XML/1998/namespace}id"

/* Set by handle_error(); makes main() return 1 instead of 0 */
static bool has_error = false;
/* XML mode: true while a "-" data line has been started but not yet
 * terminated with a newline (see handle_text()) */
static bool in_text = false;
/* Set by option -l: XML-mode handlers print "L<lineno>" lines */
static bool linenumbering = false;
/* Set by option -H: main() binds the html_* handlers instead of the
 * streaming XML handlers */
static bool input_is_html = false;
/* HTML mode only: the tree being built by the html_* handlers; created in
 * html_start() and printed by html_end() */
static Tree tree;


/* escape -- print a string with certain characters escaped */
static void escape(const string t)
{
  string s;

  for (s = t; *s; s++)
    switch (*s) {
      case '\r': printf("\\r"); break;
      case '\t': printf("\\t"); break;
      case '\n': printf("\\n"); break;
      case '\\': printf("\\\\"); break;
      case '&': if (*(s+1) == '#') printf("\\"); else printf("&"); break;
      default: putchar(*s);
    }
}


/* print_attrs -- print one "A" line for each attribute in attribs
 *
 * The attributes are printed in the order returned by pairlist_sort().
 * Each line has the form "Aname TYPE value", where TYPE is TOKEN for the
 * names "xmlid", "xml:id" and XMLID, and CDATA for everything else. The
 * value is escaped with escape(); an attribute without a value (NULL) gets
 * its own name as value.
 */
void print_attrs(const pairlist attribs)
{
  pairlist a = pairlist_sort(attribs);
  bool is_id;

  while (a != NULL) {
    is_id = eq(a->name, "xmlid") || eq(a->name, "xml:id") || eq(a->name, XMLID);
    printf("A%s %s ", a->name, is_id ? "TOKEN" : "CDATA");
    if (a->value != NULL)
      escape(a->value);
    else
      printf("%s", a->name);
    putchar('\n');
    a = a->next;
  }
}


/* output -- print the children of node t (root or element) in canonical form
 *
 * Walks the children of t in order and writes one command line per node:
 * "-" for text, "*" for comments, "?" for processing instructions (all
 * three with their text escaped), "!" for declarations, and for elements
 * the attribute lines followed by either "|name" (when is_empty() holds
 * for the name) or "(name", the element's own children, and ")name".
 */
static void output(Tree t)
{
  Tree child;

  assert(t->tp == Root || t->tp == Element);

  child = t->children;
  while (child != NULL) {
    switch (child->tp) {
    case Text:
    case Comment:
    case Procins:
      /* These three differ only in the command character */
      putchar(child->tp == Text ? '-' : child->tp == Comment ? '*' : '?');
      escape(child->text);
      putchar('\n');
      break;
    case Declaration:
      /* A missing public identifier or URL is printed as an empty string */
      printf("!%s \"%s\" %s\n", child->name, child->text ? child->text : "",
             child->url ? child->url : "");
      break;
    case Element:
      print_attrs(child->attribs);
      if (!is_empty(child->name)) {
        printf("(%s\n", child->name);
        output(child);
        printf(")%s\n", child->name);
      } else {
        assert(child->children == NULL);
        printf("|%s\n", child->name);
      }
      break;
    default:
      assert(!"Cannot happen");
    }
    child = child->sister;
  }
}


/* combine_text_nodes -- merge every run of adjacent text nodes below t
 *
 * Recurses into element children. For a text child, the text of each
 * directly following text sibling is appended to it, after which that
 * sibling is unlinked from the sister chain and disposed of.
 */
static void combine_text_nodes(Tree t)
{
  Tree cur, next;

  assert(t->tp == Root || t->tp == Element);

  for (cur = t->children; cur != NULL; cur = cur->sister) {
    if (cur->tp == Element) {
      combine_text_nodes(cur);
      continue;
    }
    if (cur->tp != Text) {
      continue;
    }
    /* Absorb following text siblings until a non-text node (or the end) */
    while ((next = cur->sister) != NULL && next->tp == Text) {
      strapp(&cur->text, next->text, NULL);
      cur->sister = next->sister;
      dispose(next->text);
      dispose(next);
    }
  }
}


/* --------------- implements interface api.h -------------------------- */

/* handle_error -- called when a parse error occurred
 *
 * Records that an error happened (main() then returns 1) and reports the
 * message s on stderr as "lineno: s". clientdata is not used.
 */
void handle_error(void *clientdata, const string s, int lineno)
{
  (void) clientdata;

  has_error = true;
  fprintf(stderr, "%d: %s\n", lineno, s);
}

/* html_start -- called before the first event is reported (HTML mode)
 *
 * Initializes the global tree with create(); the other html_* handlers add
 * to it. Always returns NULL.
 */
void *html_start(void)
{
  tree = create();

  return NULL;
}

/* start -- called before the first event is reported (XML mode)
 *
 * The XML-mode handlers print each event as it arrives, so nothing needs
 * to be set up here. Always returns NULL.
 */
void *start(void)
{
  void *nothing = NULL;

  return nothing;
}

/* html_end -- called after the last event is reported (HTML mode) */
void html_end(void *clientdata)
{
  tree = get_root(tree);
  combine_text_nodes(tree);
  output(tree);
}

/* end -- called after the last event is reported (XML mode)
 *
 * If the document ends while a "-" data line is still open, terminate
 * that line. clientdata is not used.
 */
void end(void *clientdata)
{
  (void) clientdata;

  if (!in_text)
    return;

  putchar('\n');
  in_text = false;
}

/* handle_html_comment -- called after a comment is parsed (HTML mode)
 *
 * Adds the comment to the tree; nothing is printed until html_end().
 * clientdata is not used.
 */
void handle_html_comment(void *clientdata, string commenttext)
{
  (void) clientdata;

  tree = append_comment(tree, commenttext);
}

/* handle_comment -- called after a comment is parsed (XML mode)
 *
 * Closes a pending data line, optionally prints the line number, and then
 * prints the comment as "*" followed by the escaped comment text.
 */
void handle_comment(void *clientdata, string commenttext)
{
  (void) clientdata;

  if (in_text) {
    putchar('\n');
    in_text = false;
  }
  if (linenumbering) {
    printf("L%d\n", lineno);
  }

  putchar('*');
  escape(commenttext);
  putchar('\n');
}

/* handle_html_text -- called after a text chunk is parsed (HTML mode)
 *
 * Adds the text to the tree; adjacent chunks are merged later by
 * html_end(). clientdata is not used.
 */
void handle_html_text(void *clientdata, string text)
{
  (void) clientdata;

  tree = append_text(tree, text);
}

/* handle_text -- called after a text chunk is parsed (XML mode)
 *
 * The parser may deliver one run of text in several consecutive calls.
 * Only the first call of a run starts a new "-" line (preceded by the line
 * number if requested); later calls append to it. The terminating newline
 * is written by whichever handler is called next, which is why all of them
 * check 'in_text' first.
 */
void handle_text(void *clientdata, string text)
{
  (void) clientdata;

  if (!in_text) {
    in_text = true;
    if (linenumbering) {
      printf("L%d\n", lineno);
    }
    putchar('-');
  }

  escape(text);
}

/* handle_html_decl -- called after a declaration is parsed (HTML mode)
 *
 * Adds a declaration node with root name gi, public identifier fpi and
 * system identifier url to the tree. clientdata is not used.
 */
void handle_html_decl(void *clientdata, string gi, string fpi, string url)
{
  (void) clientdata;

  tree = append_declaration(tree, gi, fpi, url);
}

/* handle_decl -- called after a declaration is parsed (XML mode)
 *
 * Closes a pending data line, optionally prints the line number, and then
 * prints the line !gi "fpi" url. A NULL fpi or url is printed as an empty
 * string.
 */
void handle_decl(void *clientdata, string gi, string fpi, string url)
{
  (void) clientdata;

  if (in_text) {
    putchar('\n');
    in_text = false;
  }
  if (linenumbering) {
    printf("L%d\n", lineno);
  }

  printf("!%s \"%s\" %s\n", gi, fpi ? fpi : "", url ? url : "");
}

/* handle_html_pi -- called after a PI is parsed (HTML mode)
 *
 * Adds the processing instruction to the tree; nothing is printed until
 * html_end(). clientdata is not used.
 */
void handle_html_pi(void *clientdata, string pi_text)
{
  (void) clientdata;

  tree = append_procins(tree, pi_text);
}

/* handle_pi -- called after a PI is parsed (XML mode)
 *
 * Closes a pending data line, optionally prints the line number, and then
 * prints the processing instruction as "?" followed by its escaped text.
 */
void handle_pi(void *clientdata, string pi_text)
{
  (void) clientdata;

  if (in_text) {
    putchar('\n');
    in_text = false;
  }
  if (linenumbering) {
    printf("L%d\n", lineno);
  }

  putchar('?');
  escape(pi_text);
  putchar('\n');
}

/* handle_html_starttag -- called after a start tag is parsed (HTML mode)
 *
 * Pushes the element onto the tree with html_push() and then releases
 * name. clientdata is not used.
 */
void handle_html_starttag(void *clientdata, string name, pairlist attribs)
{
  (void) clientdata;

  tree = html_push(tree, name, attribs);

  /* name is released here, so html_push() must not keep the pointer */
  free(name);
}

/* handle_starttag -- called after a start tag is parsed (XML mode)
 *
 * Closes a pending data line, prints one "A" line per attribute,
 * optionally prints the line number, and then prints "(name".
 *
 * name is released before returning. The HTML-mode handler bound to the
 * same parser frees name as well, so ownership of that string lies with
 * the handler; not freeing it leaked one string per start tag.
 * NOTE(review): attribs is not released here -- confirm whether the
 * handler is expected to delete the list too.
 */
void handle_starttag(void *clientdata, string name, pairlist attribs)
{
  if (in_text) {putchar('\n'); in_text = false;}
  print_attrs(attribs);
  if (linenumbering) printf("L%d\n", lineno);
  putchar('(');
  printf("%s", name);
  putchar('\n');
  free(name);
}

/* handle_html_emptytag -- called after an empty tag is parsed (HTML mode)
 *
 * Treated exactly like a start tag: the element is pushed onto the tree
 * with html_push() and name is released. clientdata is not used.
 */
void handle_html_emptytag(void *clientdata, string name, pairlist attribs)
{
  (void) clientdata;

  tree = html_push(tree, name, attribs);

  /* name is released here, so html_push() must not keep the pointer */
  free(name);
}

/* handle_emptytag -- called after an empty tag is parsed (XML mode)
 *
 * Closes a pending data line, prints one "A" line per attribute,
 * optionally prints the line number, and then prints "|name".
 *
 * name is released before returning. The HTML-mode handler bound to the
 * same parser frees name as well, so ownership of that string lies with
 * the handler; not freeing it leaked one string per empty tag.
 * NOTE(review): attribs is not released here -- confirm whether the
 * handler is expected to delete the list too.
 */
void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
  if (in_text) {putchar('\n'); in_text = false;}
  print_attrs(attribs);
  if (linenumbering) printf("L%d\n", lineno);
  putchar('|');
  printf("%s", name);
  putchar('\n');
  free(name);
}

/* handle_html_endtag -- called after an endtag is parsed (HTML mode)
 *
 * name may be "". Closes the element via html_pop() and then releases
 * name. clientdata is not used.
 */
void handle_html_endtag(void *clientdata, string name)
{
  (void) clientdata;

  tree = html_pop(tree, name);

  /* name is released here, so html_pop() must not keep the pointer */
  free(name);
}

/* handle_endtag -- called after an endtag is parsed (XML mode)
 *
 * name may be "". Closes a pending data line, optionally prints the line
 * number, and then prints ")name".
 *
 * name is released before returning. The HTML-mode handler bound to the
 * same parser frees name as well, so ownership of that string lies with
 * the handler; not freeing it leaked one string per end tag.
 */
void handle_endtag(void *clientdata, string name)
{
  if (in_text) {putchar('\n'); in_text = false;}
  if (linenumbering) printf("L%d\n", lineno);
  putchar(')');
  printf("%s", name);
  putchar('\n');
  free(name);
}

/* --------------------------------------------------------------------- */

/* usage -- print usage message on stderr and exit with status 2
 *
 * prog is the program name to show in the message. The option list matches
 * the options that main() passes to getopt(): -H, -l and -v.
 * Does not return.
 */
static void usage(string prog)
{
  fprintf(stderr, "Usage: %s [-H] [-l] [-v] [html-file-or-url]\n", prog);
  exit(2);
}

/* main -- parse the command line, open the input and run the parser
 *
 * Options: -H selects the HTML handlers, -l turns on line numbering, and
 * -v prints the version and returns 0. With no file argument, or with "-",
 * input is read from stdin; otherwise the argument is opened with
 * fopenurl().
 *
 * Exit status: 0 on success, 1 if handle_error() was called, 2 after a
 * usage error (via usage()), 3 if yyparse() fails, EX_IOERR if the input
 * cannot be opened or the status reported by fopenurl() is not 200.
 */
int main(int argc, char *argv[])
{
  int opt;
  int status = 200;

  /* Parse command line arguments */
  while ((opt = getopt(argc, argv, "Hlv")) != -1) {
    switch (opt) {
    case 'H':
      input_is_html = true;
      break;
    case 'l':
      linenumbering = true;
      break;
    case 'v':
      printf("Version: %s %s\n", PACKAGE, VERSION);
      return 0;
    case '?':
      usage(argv[0]);
      break;
    default:
      assert(!"Cannot happen");
    }
  }

  /* Select the input: at most one non-option argument is allowed */
  switch (argc - optind) {
  case 0:
    yyin = stdin;
    break;
  case 1:
    if (eq(argv[optind], "-"))
      yyin = stdin;
    else
      yyin = fopenurl(argv[optind], "r", &status);
    break;
  default:
    usage(argv[0]);
  }

  if (yyin == NULL)
    err(EX_IOERR, "%s", argv[optind]);
  if (status != 200)
    errx(EX_IOERR,"%s: %s",argv[optind],http_strerror(status));

  /* Bind the parser callback routines to our handlers. The error handler
   * is shared; every other callback has an XML and an HTML variant. */
  set_error_handler(handle_error);
  set_start_handler(input_is_html ? html_start : start);
  set_end_handler(input_is_html ? html_end : end);
  set_comment_handler(input_is_html ? handle_html_comment : handle_comment);
  set_text_handler(input_is_html ? handle_html_text : handle_text);
  set_decl_handler(input_is_html ? handle_html_decl : handle_decl);
  set_pi_handler(input_is_html ? handle_html_pi : handle_pi);
  set_starttag_handler(input_is_html ? handle_html_starttag : handle_starttag);
  set_emptytag_handler(input_is_html ? handle_html_emptytag : handle_emptytag);
  set_endtag_handler(input_is_html ? handle_html_endtag : handle_endtag);

  if (yyparse() != 0)
    exit(3);

  if (has_error)
    return 1;
  return 0;
}
