Remove pre-processing; use a newer model trained on images without pre-processing

9 months ago · e44c515780
5 changed files with 429 additions and 402 deletions
--- a/model_gen/captcha_ocr.ipynb
+++ b/model_gen/captcha_ocr.ipynb
--- a/src/group1-shard1of1.bin
+++ b/src/group1-shard1of1.bin
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f673767adc061648864cf142f2854df49c04ba0f717da152130cb007627e150d
-size 1863520
+oid sha256:8e7624ac711c06c8801ae29cd527bf908c9a9c94572d969aa297a7788fc8b6b3
+size 1863004
--- a/src/main.ts
+++ b/src/main.ts
@ -1,9 +1,8 @@
 import * as tf from '@tensorflow/tfjs';
 import { setWasmPaths } from '@tensorflow/tfjs-backend-wasm';
 import modelJSON from './model.json';
-import * as ccl from './ccl';

-const charset = [' ', '0', '2', '4', '5', '8', 'A', 'D', 'G', 'H', 'J', 'K', 'M', 'N', 'P', 'R', 'S', 'T', 'V', 'W', 'X', 'Y'];
+const charset = [' ', '0', '2', '4', '8', 'A', 'D', 'G', 'H', 'J', 'K', 'M', 'N', 'P', 'R', 'S', 'T', 'V', 'W', 'X', 'Y'];
 let weightsData: Uint8Array; // base64 encoded weights
 let model: tf.LayersModel;

@ -269,11 +268,16 @@ async function imageFromCanvas(img: HTMLImageElement, bg: HTMLImageElement, off:
  const scale = th / h;

  const canvas = document.createElement('canvas');
+  const fcanvas = document.createElement('canvas');
  const cw = w * scale + pw * 2;
  canvas.width = cw >= 300 ? 300 : cw;
  canvas.height = th;

-  const ctx = canvas.getContext('2d', { willReadFrequently: true })!;
+  fcanvas.width = 300;
+  fcanvas.height = 80;
+
+  const ctx = canvas.getContext('2d')!;
+  const fctx = fcanvas.getContext('2d')!; // used to contain the captcha stretched to 300w

  ctx.fillStyle = 'rgb(238,238,238)';
  ctx.fillRect(0, 0, canvas.width, canvas.height);
@ -296,96 +300,26 @@ async function imageFromCanvas(img: HTMLImageElement, bg: HTMLImageElement, off:
      );
    }
    ctx.drawImage(img, -w / 2, -h / 2, w, h);
+    fctx.drawImage(canvas, 0, 0, 300, 80);
  };

  if (bg && off == null) {
    off = await slideCaptcha(document.getElementById('t-fg')!, document.getElementById('t-bg')!, document.getElementById('t-slider') as HTMLInputElement);
  }
  draw(off || 0);
-  return ctx.getImageData(0, 0, canvas.width, canvas.height);
-}
-
-// for debugging purposes
-function imagedataToImage(imagedata: ImageData) {
-  const canvas = document.createElement('canvas');
-  const ctx = canvas.getContext('2d')!;
-  canvas.width = imagedata.width;
-  canvas.height = imagedata.height;
-  ctx.putImageData(imagedata, 0, 0);
-
-  const image = new Image();
-  image.src = canvas.toDataURL();
-  return image;
+  return fctx.getImageData(0, 0, 300, 80);
 }

-function toMonochrome(px: Uint8ClampedArray) {
+function toMonochromeFloat(px: Uint8ClampedArray) {
  const ret = Array<number>(px.length >> 2);
  for (let i = 0; i < px.length; i += 4) {
-    ret[i >> 2] = +(px[i] < 128);
+    ret[i >> 2] = px[i] / 255;
  }
  return ret;
 }

 const greedyCTCDecode = (yPred: tf.Tensor<tf.Rank>) => tf.tidy(() => yPred.argMax(-1).arraySync());

-function imgDisp(pixConv: (f: ArrayLike<number>, w: number, h: number, s: Uint8ClampedArray) => ArrayLike<number>, img: ArrayLike<number>, w: number, h: number, t?: boolean) {
-  const dt = new ImageData(w, h);
-  const rgba = pixConv(img, w, h, dt.data);
-  const imgres = imagedataToImage(dt);
-  document.body.appendChild(imgres);
-  if (t) {
-    imgres.style.transform = 'rotate(90deg) scaleY(-1)';
-  }
-}
-
-let colors = [
-  [255, 0, 0],   // Red
-  [0, 255, 0],   // Green
-  [0, 0, 255],   // Blue
-  [255, 255, 0], // Yellow
-  [255, 0, 255], // Magenta
-  [0, 255, 255], // Cyan
-  [128, 0, 0],   // Dark Red
-  [0, 128, 0],   // Dark Green
-  [0, 0, 128],   // Dark Blue
-  [128, 128, 0], // Olive
-  [128, 0, 128], // Purple
-  [0, 128, 128], // Teal
-  [192, 192, 192], // Silver
-  [128, 128, 128], // Gray
-  [255, 165, 0], // Orange
-  [0, 128, 64]   // Medium Sea Green
-];
-
-const monoToPalette = (p: number[][], max: number) =>
-  function (arr: ArrayLike<number>, w: number, h: number, res: Uint8ClampedArray) {
-    let choice = p.slice(0);
-    const choices = new Map<number, number[]>();
-    for (let i = 0; i < arr.length; ++i) {
-      let col: number[];
-      if (choices.has(arr[i])) {
-        col = choices.get(arr[i])!;
-      } else {
-        col = choice.shift()!;
-        choices.set(arr[i], col);
-        if (choice.length == 0)
-          choice = p.slice(0);
-      }
-      [res[i * 4], res[i * 4 + 1], res[i * 4 + 2]] = col;
-      res[i * 4 + 3] = 255;
-    }
-    return res;
-  };
-
-
-function monoToRgba(arr: ArrayLike<number>, w: number, h: number, res: Uint8ClampedArray) {
-  for (let i = 0; i < arr.length; ++i) {
-    res[i * 4] = res[i * 4 + 1] = res[i * 4 + 2] = arr[i] * 255;
-    res[i * 4 + 3] = 255;
-  }
-  return res;
-}
-
 function processCTCDecodedSequence(decodedSequence: number[], blankLabel = 0) {
  const result = [];
  let prevLabel = blankLabel;
@ -411,114 +345,21 @@ async function predict(img: HTMLImageElement, bg: HTMLImageElement, off: number)
  const image = await imageFromCanvas(img, bg, off);
  if (!image)
    throw new Error("Failed to gen image");
-  const mono = toMonochrome(image.data);
-  console.log(mono.reduce((a, b) => a + b), 0);
-  const labels = ccl.connectedComponentLabeling(mono, image.width, image.height);
-  const props = ccl.computeBounds(labels, image.width, image.height);
-
-  const sortedByArea = Object.entries(props).sort((a, b) => a[1].area - b[1].area);
-  const n = 8;
-  let eightBiggest = sortedByArea.slice(0, -n);
-  //const filtered = new Float32Array(80 * 300);
-
-  // TODO: maybe centering?
-  //imgDisp(monoToPalette(colors, Math.max(...new Set(labels))), labels, image.width, image.height);
-
-  for (const [label, region] of eightBiggest) {
-    //if ((region.maxRow - region.minRow) <= 20) {
-    //  continue;
-    //}
-
-    for (let y = region.minRow; y <= region.maxRow; ++y) {
-      for (let x = region.minCol; x <= region.maxCol; ++x) {
-        if (labels[y * image!.width + x] === +label) {
-          labels[y * image!.width + x] = 0;
-        }
-      }
-    }
-  }
-
-  eightBiggest = sortedByArea.slice(-n);
-  //imgDisp(monoToPalette(colors, Math.max(...new Set(labels))), labels, image.width, image.height);
+  const mono = toMonochromeFloat(image.data);

-  for (const [label, region] of eightBiggest) {
-    if ((region.maxRow - region.minRow) > 20) {
-      continue;
-    }
-
-    for (let y = region.minRow; y <= region.maxRow; ++y) {
-      for (let x = region.minCol; x <= region.maxCol; ++x) {
-        if (labels[y * image!.width + x] === +label) {
-          labels[y * image!.width + x] = 0;
-        }
-      }
-    }
-  }
-  //imgDisp(monoToPalette(colors, Math.max(...new Set(labels))), labels, image.width, image.height);
-
-  for (const [label, region] of eightBiggest) {
-    if ((region.maxRow - region.minRow) <= 20) {
-      continue;
-    }
-
-    for (let y = region.minRow; y <= region.maxRow; ++y) {
-      for (let x = region.minCol; x <= region.maxCol; ++x) {
-        if (labels[y * image!.width + x] === +label) {
-          labels[y * image!.width + x] = 1;
-        }
-      }
-    }
-  }
-
-  const filtered2 = tf.tensor3d(labels, [image.height, image.width, 1]).concat(tf.zeros([80, 300 - image.width, 1]), 1);
-  //imgDisp(monoToPalette(colors, Math.max(...new Set(labels))), labels, image.width, image.height);
-  //const tensor = tf.tensor3d(filtered, [80, 300, 1], 'float32');
-  //const tr = [1, 0, 2];
-  //console.log(tensor.shape, tensor.transpose(tr).shape);
+  const filtered2 = tf.tensor3d(mono, [image.height, image.width, 1]);
  const prediction = model.predict(filtered2.transpose([1, 0, 2]).expandDims(0));

  let d: tf.TypedArray;

-  if (!Array.isArray(prediction)) {
-    const v = greedyCTCDecode(prediction) as number[][];
-    console.log(v);
-    const s = processCTCDecodedSequence(v[0], charset.length + 1);
-    return indicesToSymbols(s).join('').trim();
-  } else
-    throw new Error("unexpected inference");
+  if (Array.isArray(prediction))
+    throw new Error("Unexpected inference type");

-  // createSequence(d);
-  return '';
+  const v = greedyCTCDecode(prediction) as number[][];
+  const s = processCTCDecodedSequence(v[0], charset.length + 1);
+  return indicesToSymbols(s).join('').trim();
 }

-function createSequence(prediction: any) {
-  const csl = charset.length;
-  const sequence: Record<string, number>[] = [];
-
-  // for each prediction
-  for (let pos = 0; pos < prediction.length; pos += csl) {
-    // look at the probabilities for the 22 token characters
-    const preds = prediction.slice(pos, pos + csl);
-    const max = Math.max(...preds);
-
-    const seqElem: Record<string, number> = {};
-
-    for (let i = 0; i < csl; i++) {
-      const p = preds[i] / max; // normalize probability
-      const c = charset[i + 1];
-
-      if (p >= 0.05) { // if it's probable enough
-        seqElem[c || ''] = p; // save its probability, to give alternative solutions
-      }
-    }
-
-    sequence.push(seqElem);
-  }
-
-  return sequence;
-}
-
-
 async function imageFromUri(uri: string) {
  if (uri.startsWith('url("')) {
    uri = uri.substr(5, uri.length - 7);
--- a/src/model.json
+++ b/src/model.json
--- a/src/model.weights.bin
+++ b/src/model.weights.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b3d1959699ed9381dc680dcb6676e90670c6841c2b97fcf696ff6c2db261e1f5
-size 15704896