[ Back to the overview Matrix ]

Test case : Tokens using Algol 68

Lines used: 39
# Algol is notoriously difficult to lex/parse properly, being meant for program[mer]s, not compiler writers. This is a 90% solution.

# Known problems: COMMENT (as opposed to hash-comments like this one) is not treated specially, so the count goes badly wrong if such a comment contains an unbalanced quote or hash, and slightly wrong otherwise; the same applies to CO, PRAGMAT and PR. Signs break real numbers, e.g. 1e+6 is counted as three tokens. Formats are not analysed specially. Compound symbols are not analysed as per the Report. To save a considerable mess, "|" and ":" are treated as [potential] nomads, meaning that they "are" dyadic operators. This means that "|:", ":=", "=:", ":=:" and ":/=:" work, but it also means that many illegal sequences count as one token. It would take a language lawyer to determine whether any legal programs are mis-counted. #

BEGIN FILE f;
      # Open the input file named by argv (4) on the stand in channel.
        open yields zero on success; any other result means there is
        nothing to read, so report it and stop instead of carrying on
        with a file that was never opened. #
      IF open (f, argv (4 #sic# ), standin channel) /= 0
      THEN print (("cannot open input file: ", argv (4), newline));
           stop
      FI;
      # Running totals, updated by the scanner and printed at end of input. #
      INT tokens := 0, lines := 0, chars := 0;
      # End-of-input handler: print the counts in the order
        lines, tokens, chars and stop the program. #
      on logical file end (f, (REF FILE f) BOOL:
	  (print ((lines, tokens, chars)); stop));

      # Scanner state: s is the current input line and pos the index of the
        current character in it (0 stands for the gap before the line);
        p receives the match position from the ISIN operator. #
      STRING s := ""; INT pos := 0, p;
      # Basic character classes. These do not depend on each other, so they
        may safely share one comma-separated declaration. #
      [] CHAR uc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
              lc = "abcdefghijklmnopqrstuvwxyz",
              dig = "0123456789",
              # the hash character must stay first: skip space tests p = 1
                to recognise a comment opener; includes tab, but not
                CR/LF/PF #
              space = "#	 ",
              monad = "&%~+-^",
              nomad = "<>/=*:|";	# a sort-of lie, see above #
      # Derived classes. They are declared after the semicolon because the
        parts of a comma-separated declaration are elaborated collaterally,
        so lc, dig, monad and nomad would not be guaranteed to exist yet
        if these stayed in the declaration above. #
      [] CHAR alphanum = lc+dig+".",	# overkill? #
              dyad = monad+nomad;
      # Yield the character under the cursor.
        When pos has run past the end of the line, the next line is read
        first: pos is reset to 0, lines is bumped, and chars grows by the
        line length plus one. A pos below the lower bound of s (that is,
        the reset value 0) yields a blank, so every line boundary is seen
        as one space. #
      PROC current = CHAR:
        BEGIN
           IF pos > UPB s
           THEN get (f, (s, newline));
                pos := 0;
                lines +:= 1;
                chars +:= UPB s + 1
           FI;
           IF pos < LWB s THEN " " ELSE s[pos] FI
        END;
      # Advance the cursor by one position; no input is read here. #
      PROC skip = VOID: pos := pos + 1;
      # Advance the cursor by one position and yield the character there. #
      PROC next = CHAR:
        BEGIN skip;
              current
        END;
      # c ISIN s: TRUE when character c occurs in the row s.
        The test is delegated to char in string, which is handed the
        global p to receive the position of the match; skip space reads
        p afterwards. #
      PRIO ISIN = 4;
      OP ISIN = (CHAR c, [] CHAR s) BOOL:
        BEGIN BOOL found = char in string (c, p, s);
              found
        END;
      # Skip blanks, tabs and hash-delimited comments.
        After a successful ISIN test against space, p = 1 means the
        current character is the hash that opens a comment; the inner
        loop then runs to the closing hash, and the final skip steps
        past it. #
      PROC skip space = VOID:
        WHILE current ISIN space
        DO IF p = 1
           THEN WHILE next /= "#" DO SKIP OD
           FI;
           skip
        OD;
                # skips (some) comments as well #
      # Advance one position, pass over any white space and comments, and
        yield the first character after them. #
      PROC next vis = CHAR:
        BEGIN skip;
              skip space;
              current
        END;

      # Consume one token and add one to tokens.
        Leading white space and comments are skipped first; the token is
        counted before it is scanned, and its class is chosen by its
        first character. #
      PROC next token = VOID:
        BEGIN
           skip space;
           tokens +:= 1;
           IF current = """"
           THEN # string denotation: run to the closing quote; if the
                  character after it is another quote, it was a doubled
                  quote inside the string, so keep going #
                WHILE
                   WHILE next /= """" DO SKIP OD;
                   next = """"
                DO SKIP OD
           ELIF current ISIN uc
           THEN # "bold" tag #
                WHILE next ISIN uc+dig DO SKIP OD
           ELIF current ISIN alphanum
           THEN # ident/number; white space inside it is passed over #
                WHILE next vis ISIN alphanum DO SKIP OD
           ELIF current ISIN dyad
           THEN # operator, sort-of #
                WHILE next ISIN nomad DO SKIP OD
           ELSE # other character #
                skip
           FI
        END;

      # Consume tokens without end; the loop has no exit of its own, and
        the program finishes when the logical file end handler installed
        on f prints the totals and stops. #
      WHILE TRUE DO next token OD
END
Contributed by andrew.walker at nottingham.ac.uk