+/*
+ * UTF8sanitizer.c - filter out invalid UTF-8 byte sequences
+ *
+ * Copyright (C) 2006 Jonas Svensson (jonass@lysator.liu.se)
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* To compile:
+ gcc -O2 -o UTF8sanitizer UTF8sanitizer.c
+
+ Usage (assuming bash):
+ UTF8sanitizer <sourcefile >destfile 2>errors.txt
+*/
+
#include <stdio.h>
int main(int argc, char** argv) {
state = 1;
current_size=0;
current_char=getchar();
- while (!feof(stdin)) { //state != 0) {
+ while (!feof(stdin)) {
if ((current_char & 128) == 0) {
//Handle_ASCII_char();
if (current_char == '\n')