diff --git a/inc/config.php b/inc/config.php index 5926eb1d..ebe17572 100644 --- a/inc/config.php +++ b/inc/config.php @@ -824,6 +824,15 @@ // Set this to true if you're using Linux and you can execute `md5sum` binary. $config['gnu_md5'] = false; + // Use Tesseract OCR to extract text from uploaded images, so that spam filters can match against it. + $config['tesseract_ocr'] = false; + + // Extra command-line parameters appended to the `tesseract` invocation. + $config['tesseract_params'] = ''; + + // Command that preprocesses the image before OCR; %s is replaced with the shell-escaped image path and the output is piped to `tesseract`. + $config['tesseract_preprocess_command'] = 'convert -monochrome %s -'; + // Number of posts in a "View Last X Posts" page $config['noko50_count'] = 50; // Number of posts a thread needs before it gets a "View Last X Posts" page. @@ -1015,6 +1024,10 @@ // Minify Javascript using http://code.google.com/p/minify/. $config['minify_js'] = false; + // Dispatch thumbnail loading and image configuration with JavaScript. This requires accompanying JavaScript + // code to work. + $config['javascript_image_dispatch'] = false; + /* * ==================== * Video embedding diff --git a/inc/functions.php b/inc/functions.php index 154386b0..5a8f6000 100755 --- a/inc/functions.php +++ b/inc/functions.php @@ -2695,7 +2695,7 @@ function slugify($post) { elseif (isset ($post['body_nomarkup']) && $post['body_nomarkup']) $slug = $post['body_nomarkup']; elseif (isset ($post['body']) && $post['body']) - $slug = strip_html($post['body']); + $slug = strip_tags($post['body']); // Fix UTF-8 first $slug = mb_convert_encoding($slug, "UTF-8", "UTF-8"); diff --git a/post.php b/post.php index 4ea08ce3..bf723e01 100644 --- a/post.php +++ b/post.php @@ -652,14 +652,14 @@ if (isset($_POST['delete'])) { $post['filehash'] = md5($allhashes); } } - + if (!hasPermission($config['mod']['bypass_filters'], $board['uri'])) { - require_once 'inc/filters.php'; - + require_once 'inc/filters.php'; + do_filters($post); } - - if ($post['has_file']) { + + if ($post['has_file']) { foreach ($post['files'] as $key => &$file) { if ($file['is_an_image']) { if 
($config['ie_mime_type_detection'] !== false) { @@ -787,6 +787,34 @@ if (isset($_POST['delete'])) { $file['thumbwidth'] = $size[0]; $file['thumbheight'] = $size[1]; } + + if ($config['tesseract_ocr']) { // Let's OCR it! + $fname = $file['tmp_name']; + + if ($file['height'] > 500 || $file['width'] > 500) { + $fname = $file['thumb']; + } + + if ($fname == 'spoiler') { // We don't have that much CPU time, do we? + } + else { + $tmpname = "tmp/tesseract/".rand(0,10000000); + + // The default preprocess command is an ImageMagick black-and-white quantization + $error = shell_exec_error(sprintf($config['tesseract_preprocess_command'], escapeshellarg($fname)) . " | " . + 'tesseract stdin '.escapeshellarg($tmpname).' '.$config['tesseract_params']); + $tmpname .= ".txt"; + + $value = @file_get_contents($tmpname); + @unlink($tmpname); + + if ($value && trim($value)) { + // Append the recognized text to the post body, so that spam filters can match against text + // contained in the image. + $post['body_nomarkup'] .= "".htmlspecialchars($value).""; + } + } + } if (!isset($dont_copy_file) || !$dont_copy_file) { if (isset($file['file_tmp'])) { @@ -827,6 +855,11 @@ if (isset($_POST['delete'])) { } } + // Run the filters again when OCR is enabled, since the post body may now include text recognized from images + if ($config['tesseract_ocr'] && !hasPermission($config['mod']['bypass_filters'], $board['uri'])) { + do_filters($post); + } + if (!hasPermission($config['mod']['postunoriginal'], $board['uri']) && $config['robot_enable'] && checkRobot($post['body_nomarkup'])) { undoImage($post); if ($config['robot_mute']) { diff --git a/tmp/tesseract/.gitkeep b/tmp/tesseract/.gitkeep new file mode 100644 index 00000000..e69de29b