scq/scq.c

// Copyright (C) 2025 Enno Tensing <tenno+scq@suij.in>
// Copyright (C) <2007, 2008> Enno Boland <g s01 de>
// SPDX-FileCopyrightText: (C) 2007 - 2014 Enno Boland <g s01 de>
// SPDX-FileCopyrightText: (C) 2025 Enno Tensing <tenno+scq@suij.in>
// SPDX-License-Identifier: MIT

#define _LARGEFILE64_SOURCE
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <errno.h>

/* Program name used as the prefix for perror() diagnostics; overridable at build time. */
#ifndef PACKAGE
#define PACKAGE "scq"
#endif

/* Version string printed by the -v option; overridable at build time. */
#ifndef VERSION
#define VERSION "0.0"
#endif

/* Slack bytes added to input-buffer allocations so a terminating NUL fits. */
#define CHARWIDTH 4

/*
 * Number of elements of a true array (not a pointer).
 * Fully parenthesized so it can be used inside larger expressions.
 */
#define LENGTH(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Store byte `a` at index `i` of the heap buffer `b`, growing `b` by BUFSIZ
 * bytes each time `i` reaches a multiple of BUFSIZ.
 *
 * `b` must be a modifiable pointer lvalue and `i` must be free of side
 * effects (both are evaluated more than once).
 *
 * On allocation failure the old buffer is freed, `b` is set to NULL, a
 * message is printed through eprint() and the ENCLOSING FUNCTION returns -1.
 */
#define ADDC(b, i, a)                                                      \
	do {                                                               \
		if ((i) % BUFSIZ == 0) {                                   \
			void *addc_grown_ =                                \
				realloc((b), ((i) + BUFSIZ) * sizeof(char)); \
			if (!addc_grown_) {                                \
				free(b);                                   \
				(b) = NULL;                                \
				eprint("Malloc failed.");                  \
				return -1;                                 \
			}                                                  \
			(b) = addc_grown_;                                 \
		}                                                          \
		(b)[(i)] = (a);                                            \
	} while (0)

/*
 * Common signature of every parser: (begin, end, newblock).
 * As consumed by process(): a return of 0 means "nothing matched here";
 * otherwise the input advances by the absolute value of the result, a
 * negative result makes the following text start a new block, and the
 * exact value -1 makes process() stop and return 0.
 */
typedef int (*parser)(const char *, const char *, int);

/* One markup rule: a literal marker and the HTML emitted around its content. */
struct tag {
	char *search; /* literal marker text to look for in the input */
	int process; /* 0: content is only HTML-escaped via hprint();
		      * non-zero: content is run through process() again;
		      * dolineprefix() additionally treats values >= 2 as
		      * "process as a new block" */
	char *before; /* HTML written before the content */
	char *after; /* HTML written after the content */
};

/* File helpers (external linkage). */
off64_t get_file_size(const char *);
char *read_file(const char *, off64_t);

/* Parsers; each has the `parser` signature (begin, end, newblock). */
static int doamp(const char *, const char *, int);
static int docomment(const char *, const char *, int);
static int dogtlt(const char *, const char *, int);
static int dohtml(const char *, const char *, int);
static int dolineprefix(const char *, const char *, int);
static int dolink(const char *, const char *, int);
static int dolist(const char *, const char *, int);
static int doparagraph(const char *, const char *, int);
static int doreplace(const char *, const char *, int);
static int doshortlink(const char *, const char *, int);
static int dosurround(const char *, const char *, int);
static int dounderline(const char *, const char *, int);
/* Allocation, diagnostics, escaped output and the main dispatch loop. */
static void *ereallocz(void *, size_t);
static void eprint(const char *, ...);
static void hprint(const char *, const char *);
static int process(const char *, const char *, int);

/*
 * List of parsers. process() tries them in this order at every input
 * position and stops at the first one that returns non-zero, so the order
 * decides which rule wins when several could match.
 */
static parser parsers[] = { dounderline, docomment, dolineprefix, dolist,
			    doparagraph, dogtlt,    dosurround,	  dolink,
			    doshortlink, dohtml,    doamp,	  doreplace };
/* Set to 1 by the -n option: raw HTML is not passed through but escaped. */
static int nohtml = 0;

/*
 * Rules keyed on the text at the start of a line, used by dolineprefix().
 * Entries are tried in array order and the first match wins. An entry whose
 * marker ends in '\n' (the horizontal rule) has no content.
 */
static struct tag lineprefix[] = {
	{ "    ", 0, "<pre><code>", "\n</code></pre>" },
	{ "\t", 0, "<pre><code>", "\n</code></pre>" },
	{ ">", 2, "<blockquote>", "</blockquote>" },
	{ "###### ", 1, "<h6>", "</h6>" },
	{ "##### ", 1, "<h5>", "</h5>" },
	{ "#### ", 1, "<h4>", "</h4>" },
	{ "### ", 1, "<h3>", "</h3>" },
	{ "## ", 1, "<h2>", "</h2>" },
	{ "# ", 1, "<h1>", "</h1>" },
	{ "- - -\n", 1, "<hr />", "" },
};

/*
 * Headings marked by a second line made of one repeated character, used by
 * dounderline(); only the first character of `search` is compared.
 */
static struct tag underline[] = {
	{ "=", 1, "<h1>", "</h1>\n" },
	{ "-", 1, "<h2>", "</h2>\n" },
};

/*
 * Inline spans delimited by the same marker on both sides, used by
 * dosurround(). Entries are tried in array order; longer markers are listed
 * before the shorter markers that are their prefixes.
 */
static struct tag surround[] = {
	{ "```", 0, "<code>", "</code>" },
	{ "``", 0, "<code>", "</code>" },
	{ "`", 0, "<code>", "</code>" },
	{ "___", 1, "<strong><em>", "</em></strong>" },
	{ "***", 1, "<strong><em>", "</em></strong>" },
	{ "__", 1, "<strong>", "</strong>" },
	{ "**", 1, "<strong>", "</strong>" },
	{ "_", 1, "<em>", "</em>" },
	{ "*", 1, "<em>", "</em>" },
};

/*
 * Backslash escapes handled by doreplace(): when the input starts with the
 * first string, the second string is written and the first is consumed.
 */
static const char *replace[][2] = {
	{ "\\\\", "\\" }, { "\\`", "`" }, { "\\*", "*" }, { "\\_", "_" },
	{ "\\{", "{" },	  { "\\}", "}" }, { "\\[", "[" }, { "\\]", "]" },
	{ "\\(", "(" },	  { "\\)", ")" }, { "\\#", "#" }, { "\\+", "+" },
	{ "\\-", "-" },	  { "\\.", "." }, { "\\!", "!" },
};

/*
 * Insertions handled by doreplace(): when the input starts with the first
 * string, the second string is written but no input is consumed.
 * The single entry turns two spaces before a newline into a line break.
 */
static const char *insert[][2] = {
	{ "  \n", "<br />" },
};

/*
 * Return the size in bytes that stat() reports for `path`,
 * or -1 when stat() fails (errno is left as set by stat()).
 */
off64_t get_file_size(const char *path)
{
	struct stat info;
	int rc = stat(path, &info);

	if (rc != 0)
		return -1;

	return info.st_size;
}

/*
 * Read the first `file_size` bytes of `path` into a newly allocated buffer.
 *
 * The buffer is zero-filled and CHARWIDTH bytes larger than `file_size`, so
 * the caller may store a terminating NUL at index `file_size`. The caller
 * owns the buffer and must free() it.
 *
 * Returns NULL after printing a diagnostic with perror() when the size is
 * invalid, the file cannot be opened, memory cannot be allocated, a read
 * fails, or the file ends before `file_size` bytes were read. errno is
 * preserved across the cleanup so the caller can still inspect it.
 *
 * read() may legitimately return fewer bytes than requested, or fail with
 * EINTR, so the data is collected in a loop rather than with one call.
 */
char *read_file(const char *path, off64_t file_size)
{
	char *buf;
	off64_t total = 0;
	int fd;
	int saved_errno;

	/* Reject sizes that cannot be represented as an allocation request. */
	if (file_size < 0 ||
	    (unsigned long long)file_size > (size_t)-1 - CHARWIDTH) {
		errno = file_size < 0 ? EINVAL : EFBIG;
		perror(PACKAGE);
		return NULL;
	}

	fd = open(path, O_RDONLY | O_LARGEFILE | O_NONBLOCK);
	if (fd < 0) {
		perror(PACKAGE);
		return NULL;
	}

	buf = calloc((size_t)file_size + CHARWIDTH, sizeof(char));
	if (!buf) {
		perror(PACKAGE);
		goto fail;
	}

	while (total < file_size) {
		ssize_t got = read(fd, buf + total,
				   (size_t)(file_size - total));

		if (got < 0) {
			if (errno == EINTR)
				continue;
			perror(PACKAGE);
			goto fail;
		}
		if (got == 0) {
			/* File is shorter than the size we were given. */
			errno = EIO;
			perror(PACKAGE);
			goto fail;
		}
		total += got;
	}

	close(fd);
	return buf;

fail:
	saved_errno = errno;
	close(fd);
	free(buf);
	errno = saved_errno;
	return NULL;
}

/*
 * printf-style diagnostic output: formats the arguments according to
 * `format` and writes the result to stderr. Does not append a newline and
 * does not terminate the program.
 */
void eprint(const char *format, ...)
{
	va_list args;

	va_start(args, format);
	(void)vfprintf(stderr, format, args);
	va_end(args);
}

/*
 * Parser for '&'.
 *
 * When raw HTML is allowed (nohtml == 0) the text after the '&' is scanned
 * up to the first of ';', space, backslash, newline, tab or NUL. If that
 * scan reaches `end` or stops on ';' the '&' is left for other parsers
 * (returns 0). Otherwise, and always when nohtml is set, "&amp;" is written
 * and 1 is returned. `newblock` is unused.
 */
int doamp(const char *begin, const char *end, int newblock)
{
	const char *scan;

	if (*begin != '&')
		return 0;

	if (!nohtml) {
		scan = begin + 1;
		while (scan != end && strchr("; \\\n\t", *scan) == NULL)
			scan++;
		if (scan == end || *scan == ';')
			return 0;
	}

	fputs("&amp;", stdout);
	return 1;
}

/*
 * Parser that escapes stray '<' and '>' when raw HTML is allowed.
 *
 * Does nothing (returns 0) when nohtml is set or fewer than two bytes
 * remain. Two cases are handled:
 *  - the second byte is '>': if the first byte is neither an ASCII letter
 *    nor one of / " ' (nor NUL), the first byte followed by "&gt;" is
 *    written and 2 is returned;
 *  - otherwise, the first byte is '<': if the second byte is not an ASCII
 *    letter, "&lt;" is written and 1 is returned.
 * `newblock` is unused.
 */
int dogtlt(const char *begin, const char *end, int newblock)
{
	int second_is_gt;
	int is_letter;
	char c;

	if (nohtml || begin + 1 >= end)
		return 0;

	second_is_gt = (begin[1] == '>');

	if (!second_is_gt) {
		if (*begin != '<')
			return 0;
		c = begin[1];
		is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
		if (is_letter)
			return 0;
		fputs("&lt;", stdout);
		return 1;
	}

	c = begin[0];
	is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	if (is_letter || strchr("/\"'", c))
		return 0;
	fprintf(stdout, "%c&gt;", c);
	return 2;
}

/*
 * Parser for HTML comments ("<!-- ... -->").
 *
 * When raw HTML is allowed and the input starts with "<!--", the text up to
 * and including the first "-->" is copied to stdout followed by a newline.
 * Returns the number of bytes consumed, negated when `newblock` is set;
 * returns 0 when nohtml is set, the input is not a comment, or the comment
 * does not end inside [begin, end).
 *
 * A comment whose "-->" ends exactly at `end` is accepted: the terminator
 * only has to fit inside the range, nothing needs to follow it.
 */
int docomment(const char *begin, const char *end, int newblock)
{
	const char *tail;
	int len;

	if (nohtml || strncmp("<!--", begin, 4))
		return 0;
	tail = strstr(begin, "-->");
	if (!tail || tail + 3 > end)
		return 0;
	len = (int)(tail + 3 - begin);
	fprintf(stdout, "%.*s\n", len, begin);
	return newblock ? -len : len;
}

/*
 * Parser that passes raw HTML elements through unchanged.
 *
 * Applies only when raw HTML is allowed (nohtml == 0) and the input starts
 * with '<' followed by a letter. The tag name is the run of alphanumeric
 * bytes after '<'.
 *
 * If a matching closing tag ("</name>") is found inside [begin, end), the
 * text from `begin` up to, but not including, the closing tag's final '>'
 * is written and its length returned; that '>' is then emitted by the
 * caller's default path.
 * NOTE(review): excluding the final '>' is kept from the previous
 * implementation - confirm it is intended.
 *
 * Otherwise, if the opening tag's '>' lies inside [begin, end), the opening
 * tag alone (including '>') is written and its length returned.
 * Returns 0 when nothing was written. `newblock` is unused.
 *
 * All searches are limited to [begin, end): this function is also called on
 * sub-ranges of a larger buffer, and consuming text past `end` would make
 * the caller emit that text a second time.
 */
int dohtml(const char *begin, const char *end, int newblock)
{
	const char *p;
	const char *tag;
	const char *tagend;
	size_t taglen;
	size_t len;

	if (nohtml || begin + 2 >= end)
		return 0;
	/* <ctype.h> functions need an unsigned char value. */
	if (begin[0] != '<' || !isalpha((unsigned char)begin[1]))
		return 0;
	tag = begin + 1;
	for (p = tag; p < end && isalnum((unsigned char)*p); p++)
		;
	tagend = p;
	taglen = (size_t)(tagend - tag);

	while ((p = strstr(p, "</")) && p < end) {
		p += 2;
		/* The closing tag name must fit inside the range. */
		if (p > end || (size_t)(end - p) < taglen)
			break;
		if (strncmp(p, tag, taglen) == 0 && p[taglen] == '>') {
			len = (size_t)(p - begin) + taglen;
			fwrite(begin, sizeof(char), len, stdout);
			return (int)len;
		}
	}

	p = strchr(tagend, '>');
	if (p && p < end) {
		len = (size_t)(p - begin) + 1;
		fwrite(begin, sizeof(char), len, stdout);
		return (int)len;
	}
	return 0;
}

/*
 * Parser for constructs identified by a prefix at the start of a line
 * (code blocks, blockquotes, '#' headings, horizontal rule); the rules come
 * from the lineprefix[] table and the first matching entry wins.
 *
 * Only applies at the start of a block (`newblock`) or directly after a
 * newline. For a matching entry the `before` text is written, then all
 * consecutive lines that start with the same prefix are collected (prefix
 * removed) into a temporary buffer, trailing newlines are trimmed, and the
 * buffer is either processed recursively or HTML-escaped depending on the
 * entry's `process` field. Finally `after` plus a newline is written.
 *
 * Returns:
 *   0   nothing matched;
 *   >0  for an entry whose marker ends in '\n' (no content): marker length
 *       minus one;
 *   <0  negated number of bytes consumed for all other entries;
 *   -1  additionally used to report an allocation failure (directly, or via
 *       the ADDC macro, which returns from this function).
 */
int dolineprefix(const char *begin, const char *end, int newblock)
{
	unsigned int i, j, l;
	char *buffer;
	const char *p;

	if (newblock)
		p = begin;
	else if (*begin == '\n')
		p = begin + 1;
	else
		return 0;
	for (i = 0; i < LENGTH(lineprefix); i++) {
		l = strlen(lineprefix[i].search);
		if (end - p < l)
			continue;
		if (strncmp(lineprefix[i].search, p, l))
			continue;
		if (*begin == '\n')
			fputc('\n', stdout);
		fputs(lineprefix[i].before, stdout);
		if (lineprefix[i].search[l - 1] == '\n') {
			fputc('\n', stdout);
			return l - 1;
		}
		buffer = malloc(BUFSIZ);
		if (!buffer) {
			/* eprint() does not exit, so bail out explicitly. */
			eprint("Malloc failed.");
			return -1;
		}
		buffer[0] = '\0';

		/* Collect lines into buffer while they start with the prefix */
		j = 0;
		while ((strncmp(lineprefix[i].search, p, l) == 0) &&
		       p + l < end) {
			p += l;

			/* Special case for blockquotes: optional space after > */
			if (lineprefix[i].search[0] == '>' && *p == ' ') {
				p++;
			}

			while (p < end) {
				ADDC(buffer, j, *p);
				j++;
				if (*(p++) == '\n')
					break;
			}
		}

		/* Trim trailing newlines from the collected block */
		while (j > 0 && buffer[j - 1] == '\n') {
			j--;
		}

		ADDC(buffer, j, '\0');
		if (lineprefix[i].process)
			process(buffer, buffer + strlen(buffer),
				lineprefix[i].process >= 2);
		else
			hprint(buffer, buffer + strlen(buffer));
		puts(lineprefix[i].after);
		free(buffer);
		return -(p - begin);
	}
	return 0;
}

/*
 * Parser for inline links "[desc](url "title")" and images
 * "![alt](url "title")".
 *
 * The description ends at the "](" that pairs with the opening bracket;
 * every "![" found inside the description pushes that "](" one occurrence
 * further. The URL part ends at the ')' that balances the opening '(',
 * so parentheses may be nested inside it. An optional title is introduced
 * by " or ' and must be closed by the same character right before the
 * final ')' (whitespace in between is skipped). A URL wrapped in '<' and '>'
 * has the brackets removed.
 *
 * For images an <img> element is written with the escaped description as
 * alt text; for links an <a> element is written and the description is
 * processed recursively.
 *
 * Returns the number of bytes consumed, or 0 when the input is not a
 * well-formed link. `newblock` is unused.
 */
int dolink(const char *begin, const char *end, int newblock)
{
	int img;
	int len;
	int sep;
	int parens_depth = 1;
	const char *desc;
	const char *link;
	const char *p;
	const char *q;
	const char *descend;
	const char *linkend;
	const char *title = NULL;
	const char *titleend = NULL;

	if (*begin == '[')
		img = 0;
	else if (strncmp(begin, "![", 2) == 0)
		img = 1;
	else
		return 0;
	p = desc = begin + 1 + img;
	if (!(p = strstr(desc, "](")) || p > end)
		return 0;
	/* each nested image in the description owns one "](" */
	for (q = strstr(desc, "!["); q && q < end && q < p;
	     q = strstr(q + 1, "!["))
		if (!(p = strstr(p + 1, "](")) || p > end)
			return 0;
	descend = p;
	link = p + 2;

	/* find end of link while handling nested parens */
	q = link;
	while (parens_depth) {
		if (!(q = strpbrk(q, "()")) || q > end)
			return 0;
		if (*q == '(')
			parens_depth++;
		else
			parens_depth--;
		if (parens_depth && q < end)
			q++;
	}

	if ((p = strpbrk(link, "\"'")) && p < end && q - 1 > p + 1) {
		sep = p[0]; /* separator: can be " or ' */
		title = p + 1;
		/*
		 * strip trailing whitespace; <ctype.h> functions need an
		 * unsigned char value, plain char may be negative
		 */
		for (linkend = p; linkend > link &&
				  isspace((unsigned char)*(linkend - 1));
		     linkend--)
			;
		for (titleend = q - 1;
		     titleend > title && isspace((unsigned char)*(titleend));
		     titleend--)
			;
		if (*titleend != sep) {
			return 0;
		}
	} else {
		linkend = q;
	}

	/* Links can be given in angular brackets */
	if (*link == '<' && *(linkend - 1) == '>') {
		link++;
		linkend--;
	}

	len = q + 1 - begin;
	if (img) {
		fputs("<img src=\"", stdout);
		hprint(link, linkend);
		fputs("\" alt=\"", stdout);
		hprint(desc, descend);
		fputs("\" ", stdout);
		if (title && titleend) {
			fputs("title=\"", stdout);
			hprint(title, titleend);
			fputs("\" ", stdout);
		}
		fputs("/>", stdout);
	} else {
		fputs("<a href=\"", stdout);
		hprint(link, linkend);
		fputs("\"", stdout);
		if (title && titleend) {
			fputs(" title=\"", stdout);
			hprint(title, titleend);
			fputs("\"", stdout);
		}
		fputs(">", stdout);
		process(desc, descend, 0);
		fputs("</a>", stdout);
	}
	return len;
}

/*
 * Parser for unordered ('-', '*', '+') and ordered ("1.", "2.", ...) lists.
 *
 * Only applies at the start of a block (`newblock`) or directly after a
 * newline, and only when the marker is followed by a space or tab. Writes
 * <ul>/<ol>, then one <li> per item; each item's text is collected into a
 * temporary buffer and handed to process().
 *
 * Returns 0 when the input is not a list, otherwise a negative value whose
 * magnitude is derived from how far the input was scanned (trailing
 * newlines excluded). The ADDC macro makes this function return -1 when
 * growing the buffer fails.
 */
int dolist(const char *begin, const char *end, int newblock)
{
	unsigned int i;
	unsigned int j;
	unsigned int indent;
	unsigned int run;
	unsigned int ul;
	unsigned int isblock;
	const char *p;
	const char *q;
	char *buffer = NULL;
	char marker = 0;

	isblock = 0;
	if (newblock)
		p = begin;
	else if (*begin == '\n')
		p = begin + 1;
	else
		return 0;
	/* q remembers where the marker starts so the indent can be measured */
	q = p;
	if (*p == '-' || *p == '*' || *p == '+') {
		ul = 1;
		marker = *p;
	} else {
		ul = 0;
		/* ordered list: digits followed by '.' */
		for (; p < end && *p >= '0' && *p <= '9'; p++)
			;
		if (p >= end || *p != '.')
			return 0;
	}
	p++;
	if (p >= end || !(*p == ' ' || *p == '\t'))
		return 0;
	for (p++; p != end && (*p == ' ' || *p == '\t'); p++)
		;
	/* width of the marker plus the whitespace that follows it */
	indent = p - q;
	buffer = ereallocz(buffer, BUFSIZ);
	if (!newblock)
		fputc('\n', stdout);
	fputs(ul ? "<ul>\n" : "<ol>\n", stdout);
	run = 1;
	/* outer loop: one iteration per list item */
	for (; p < end && run; p++) {
		/* inner loop: copy the item's text into buffer, byte by byte */
		for (i = 0; p < end && run; p++, i++) {
			if (*p == '\n') {
				if (p + 1 == end)
					break;
				else {
					/* Handle empty lines */
					for (q = p + 1;
					     (*q == ' ' || *q == '\t') &&
					     q < end;
					     q++)
						;
					if (*q == '\n') {
						ADDC(buffer, i, '\0');
						i++;
						run = 0;
						isblock++;
						p = q;
					}
				}
				/* inspect the start of the next line */
				q = p + 1;
				j = 0;
				if (ul && *q == marker)
					j = 1;
				else if (!ul) {
					for (; q + j != end && q[j] >= '0' &&
					       q[j] <= '9' && j < indent;
					     j++)
						;
					if (q + j == end)
						break;
					if (j > 0 && q[j] == '.')
						j++;
					else
						j = 0;
				}
				if (q + indent < end)
					for (; (q[j] == ' ' || q[j] == '\t') &&
					       j < indent;
					     j++)
						;
				if (j == indent) {
					/* next line belongs to the list */
					ADDC(buffer, i, '\n');
					i++;
					p += indent;
					run = 1;
					if (*q == ' ' || *q == '\t')
						p++; /* indented continuation of this item */
					else
						break; /* a new marker: finish this item */
				} else if (j < indent)
					run = 0; /* less indent: the list ends here */
			}
			ADDC(buffer, i, *p);
		}
		ADDC(buffer, i, '\0');
		fputs("<li>", stdout);
		/* items separated by blank lines are processed as blocks */
		process(buffer, buffer + i,
			isblock > 1 || (isblock == 1 && run));
		fputs("</li>\n", stdout);
	}
	fputs(ul ? "</ul>\n" : "</ol>\n", stdout);
	free(buffer);
	/* step back over trailing newlines before computing the length */
	p--;
	while (*(--p) == '\n')
		;
	return -(p - begin + 1);
}

/*
 * Parser that wraps a block of text in <p>...</p>.
 *
 * Only applies at the start of a block. The paragraph runs up to the first
 * blank line ("\n\n") or, if there is none at or before `end`, up to `end`.
 * Paragraphs of at most one byte are ignored. The content is processed
 * recursively as inline text.
 *
 * Returns 0 when nothing was written, otherwise the negated number of bytes
 * consumed.
 */
int doparagraph(const char *begin, const char *end, int newblock)
{
	const char *stop;

	if (!newblock)
		return 0;

	stop = strstr(begin, "\n\n");
	if (stop == NULL || stop > end)
		stop = end;

	if (stop - begin > 1) {
		fputs("<p>", stdout);
		process(begin, stop, 0);
		fputs("</p>\n", stdout);
		return -(stop - begin);
	}
	return 0;
}

/*
 * Parser for the insert[] and replace[] tables.
 *
 * First, for every insert[] entry whose pattern the input starts with, the
 * entry's text is written; this consumes no input. Then the first replace[]
 * entry whose pattern fits into [begin, end) and matches the input has its
 * replacement written, and the pattern length is returned.
 * Returns 0 when no replace[] entry matched. `newblock` is unused.
 */
int doreplace(const char *begin, const char *end, int newblock)
{
	unsigned int k;
	unsigned int len;

	k = 0;
	while (k < LENGTH(insert)) {
		const char *pattern = insert[k][0];

		if (strncmp(pattern, begin, strlen(pattern)) == 0)
			fputs(insert[k][1], stdout);
		k++;
	}

	k = 0;
	while (k < LENGTH(replace)) {
		const char *pattern = replace[k][0];

		len = strlen(pattern);
		if (!(end - begin < len) && strncmp(pattern, begin, len) == 0) {
			fputs(replace[k][1], stdout);
			return len;
		}
		k++;
	}
	return 0;
}

/*
 * Write the bytes in [from, to) with every ASCII byte encoded as a decimal
 * character reference ("&#NN;"). Bytes >= 0x80 are written unchanged: they
 * belong to multi-byte characters, and turning each byte into its own
 * character reference would produce different characters.
 */
static void shortlink_print_obfuscated(const char *from, const char *to)
{
	const char *c;

	for (c = from; c != to; c++) {
		unsigned char byte = (unsigned char)*c;

		if (byte < 0x80)
			fprintf(stdout, "&#%u;", (unsigned int)byte);
		else
			fputc(byte, stdout);
	}
}

/*
 * Parser for automatic links written as "<target>".
 *
 * The text between '<' and the first '>' must not contain a space, tab or
 * newline. It is treated as a URL when it contains ':' or '#', and as an
 * e-mail address when it contains '@' before any ':' or '#'. Anything else
 * is not a short link.
 *
 * URLs are written as <a href="URL">URL</a> with HTML escaping. E-mail
 * addresses get a "mailto:" href and are written with character references
 * instead of plain text.
 *
 * Returns the number of bytes consumed (including both angle brackets), or
 * 0 when the input is not a short link. `newblock` is unused.
 */
int doshortlink(const char *begin, const char *end, int newblock)
{
	const char *p;
	int ismail = 0;

	if (*begin != '<')
		return 0;
	for (p = begin + 1; p != end; p++) {
		switch (*p) {
		case ' ':
		case '\t':
		case '\n':
			return 0;
		case '#':
		case ':':
			ismail = -1;
			break;
		case '@':
			if (ismail == 0)
				ismail = 1;
			break;
		case '>':
			if (ismail == 0)
				return 0;
			fputs("<a href=\"", stdout);
			if (ismail == 1) {
				/* mailto: */
				fputs("&#x6D;&#x61;i&#x6C;&#x74;&#x6F;:",
				      stdout);
				shortlink_print_obfuscated(begin + 1, p);
				fputs("\">", stdout);
				shortlink_print_obfuscated(begin + 1, p);
			} else {
				hprint(begin + 1, p);
				fputs("\">", stdout);
				hprint(begin + 1, p);
			}
			fputs("</a>", stdout);
			return p - begin + 1;
		default:
			break;
		}
	}
	return 0;
}

/*
 * Parser for inline spans enclosed by the same marker on both sides
 * (code, emphasis, strong); the rules come from the surround[] table and
 * are tried in array order.
 *
 * For an entry whose marker the input starts with, the closing marker is
 * the first later occurrence that is not directly preceded by a backslash
 * and lies inside [begin, end). If there is no such occurrence the entry is
 * skipped; in particular an escaped marker is never used as the closing
 * marker. One space directly inside each marker is dropped when both are
 * present. The content is processed recursively or HTML-escaped depending
 * on the entry's `process` field.
 *
 * Returns the number of bytes consumed (both markers included), or 0 when
 * no entry applied. `newblock` is unused.
 */
int dosurround(const char *begin, const char *end, int newblock)
{
	unsigned int i;
	unsigned int l;
	const char *p;
	const char *start;
	const char *stop;

	for (i = 0; i < LENGTH(surround); i++) {
		l = strlen(surround[i].search);
		if (end - begin < 2 * l ||
		    strncmp(begin, surround[i].search, l) != 0)
			continue;
		start = begin + l;

		/* Find the first closing marker that is not escaped. */
		p = start - 1;
		do {
			p = strstr(p + 1, surround[i].search);
		} while (p && p[-1] == '\\');
		if (!p || p >= end)
			continue;
		stop = p;

		fputs(surround[i].before, stdout);

		/* Single space at start and end are ignored */
		if (stop - start > 1 && *start == ' ' && *(stop - 1) == ' ') {
			start++;
			stop--;
			l++; /* keeps the returned length unchanged */
		}

		if (surround[i].process)
			process(start, stop, 0);
		else
			hprint(start, stop);
		fputs(surround[i].after, stdout);
		return stop - begin + l;
	}
	return 0;
}

/*
 * Parser for headings marked by an underline on the following line; the
 * rules come from the underline[] table.
 *
 * Only applies at the start of a block. The first line is the heading text;
 * the heading is recognised when the next line starts with a run of the
 * entry's marker character that is at least as long as the heading text.
 * The text is processed recursively or HTML-escaped depending on the
 * entry's `process` field.
 *
 * Returns 0 when nothing matched, otherwise the negated number of bytes
 * consumed (heading line, newline and underline run).
 */
int dounderline(const char *begin, const char *end, int newblock)
{
	unsigned int k;
	unsigned int run;
	unsigned int width;
	const char *rule;

	if (!newblock)
		return 0;

	/* Length of the heading line. */
	width = 0;
	while (begin + width + 1 != end && begin[width] != '\n')
		width++;
	if (width == 0)
		return 0;
	rule = begin + width + 1;

	for (k = 0; k < LENGTH(underline); k++) {
		const char mark = underline[k].search[0];

		run = 0;
		while (rule + run != end && rule[run] != '\n' &&
		       rule[run] == mark)
			run++;
		if (run < width)
			continue;

		fputs(underline[k].before, stdout);
		if (underline[k].process)
			process(begin, begin + width, 0);
		else
			hprint(begin, begin + width);
		fputs(underline[k].after, stdout);
		return -(run + rule - begin);
	}
	return 0;
}

/*
 * Allocate or resize a buffer.
 *
 * With p == NULL a zero-filled block of `size` bytes is allocated with
 * calloc(); otherwise `p` is resized with realloc() (added bytes are not
 * zeroed). On failure a diagnostic is printed with perror() and NULL is
 * returned; the program is not terminated.
 */
void *ereallocz(void *p, size_t size)
{
	void *res = p ? realloc(p, size) : calloc(1, size);

	if (res == NULL)
		perror(PACKAGE);
	return res;
}

/*
 * Write the bytes in [begin, end) to stdout, replacing the characters
 * & " > < with the entities &amp; &quot; &gt; &lt;.
 */
void hprint(const char *begin, const char *end)
{
	const char *cur = begin;

	while (cur != end) {
		switch (*cur) {
		case '&':
			fputs("&amp;", stdout);
			break;
		case '"':
			fputs("&quot;", stdout);
			break;
		case '>':
			fputs("&gt;", stdout);
			break;
		case '<':
			fputs("&lt;", stdout);
			break;
		default:
			fputc(*cur, stdout);
			break;
		}
		cur++;
	}
}

/*
 * Convert the text in [begin, end) and write the result to stdout.
 *
 * At every position the parsers in parsers[] are tried in order until one
 * returns non-zero; the position then advances by the absolute value of
 * that result. When no parser matches, the current byte is written
 * (HTML-escaped when nohtml is set) and the position advances by one.
 *
 * `newblock` tells the parsers whether the current position starts a new
 * block; leading newlines are skipped in that state. After each step it is
 * recomputed: set when two newlines follow, otherwise set exactly when the
 * parser result was negative.
 *
 * Returns 1 when the range was processed, 0 when a parser returned -1.
 */
int process(const char *begin, const char *end, int newblock)
{
	const char *cur = begin;

	while (cur < end) {
		const char *scan;
		unsigned int k;
		int consumed = 0;

		if (newblock) {
			while (*cur == '\n') {
				cur++;
				if (cur == end)
					return 1;
			}
		}

		for (k = 0; k < LENGTH(parsers) && consumed == 0; k++)
			consumed = parsers[k](cur, end, newblock);

		cur += abs(consumed);
		if (consumed == -1)
			return 0;

		if (consumed == 0) {
			if (nohtml)
				hprint(cur, cur + 1);
			else
				fputc(*cur, stdout);
			cur++;
		}

		/* Nothing but newlines left: done. */
		scan = cur;
		while (scan != end && *scan == '\n')
			scan++;
		if (scan == end)
			return 1;

		if (cur[0] == '\n' && cur + 1 != end && cur[1] == '\n')
			newblock = 1;
		else
			newblock = (consumed < 0);
	}
	return 1;
}

/*
 * Read standard input until end of file into a newly allocated buffer.
 *
 * On success the buffer is NUL-terminated, its length (without the NUL) is
 * stored in *out_len and the buffer is returned; the caller must free() it.
 * On failure NULL is returned with errno describing the error.
 */
static char *main_read_stdin(size_t *out_len)
{
	size_t cap = 1024 * CHARWIDTH;
	size_t used = 0;
	char *buf = malloc(cap + CHARWIDTH);
	int saved_errno;

	if (!buf)
		return NULL;

	for (;;) {
		ssize_t got;

		if (used == cap) {
			char *grown;

			if (cap > ((size_t)-1 - CHARWIDTH) / 2) {
				free(buf);
				errno = ENOMEM;
				return NULL;
			}
			cap *= 2;
			grown = realloc(buf, cap + CHARWIDTH);
			if (!grown) {
				saved_errno = errno;
				free(buf);
				errno = saved_errno;
				return NULL;
			}
			buf = grown;
		}

		/* read() returns ssize_t: -1 must not end up in a size_t. */
		got = read(STDIN_FILENO, buf + used, cap - used);
		if (got < 0) {
			if (errno == EINTR)
				continue;
			saved_errno = errno;
			free(buf);
			errno = saved_errno;
			return NULL;
		}
		if (got == 0)
			break;
		used += (size_t)got;
	}

	buf[used] = '\0';
	*out_len = used;
	return buf;
}

/*
 * Entry point.
 *
 * Options: -v prints the version to stderr and exits successfully; -n turns
 * on strict HTML escaping; "--" ends option parsing; any other option
 * prints the usage text and exits with EXIT_FAILURE.
 *
 * The first non-option argument names the input file; without one the input
 * is read from standard input. The whole input is loaded before it is
 * converted, so markup is never split at a read boundary. The HTML is
 * written to stdout.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the input cannot be read,
 * process() reports a failure, or writing to stdout fails.
 */
int main(int argc, char *argv[])
{
	char *buffer = NULL;
	size_t len = 0;
	int i;
	int ret = EXIT_SUCCESS;

	for (i = 1; i < argc; i++) {
		if (!strcmp("-v", argv[i])) {
			eprint("simple markup %s (C) 2007 - 2014 Enno Boland, (C) 2025 Enno Tensing\n", VERSION);
			goto exit;
		} else if (!strcmp("-n", argv[i])) {
			nohtml = 1;
		} else if (argv[i][0] != '-') {
			break;
		} else if (!strcmp("--", argv[i])) {
			i++;
			break;
		} else {
			eprint("Usage %s [-n] [file]\n -n escape html strictly\n",
			       argv[0]);
			ret = EXIT_FAILURE;
			goto exit;
		}
	}

	if (i < argc) {
		const char *path = argv[i];
		off64_t size = get_file_size(path);

		if (size == -1) {
			eprint("%s: %s: %s\n", argv[0], path, strerror(errno));
			ret = EXIT_FAILURE;
			goto exit;
		}
		buffer = read_file(path, size);
		if (!buffer) {
			/* read_file() has already printed a diagnostic. */
			ret = EXIT_FAILURE;
			goto exit;
		}
		len = (size_t)size;
		buffer[len] = '\0';
	} else {
		buffer = main_read_stdin(&len);
		if (!buffer) {
			perror(PACKAGE);
			ret = EXIT_FAILURE;
			goto exit;
		}
	}

	if (!process(buffer, buffer + len, 1))
		ret = EXIT_FAILURE;
	free(buffer);

	if (fflush(stdout) == EOF) {
		perror(PACKAGE);
		ret = EXIT_FAILURE;
	}
exit:
	return ret;
}