Browse Source

Add Perceptual-hash filter option

pull/46/head
coomdev 2 years ago
parent
commit
e58343db34
  1. 7
      .gitignore
  2. 22
      .vscode/settings.json
  3. 10
      main.d.ts
  4. 2
      main.meta.js
  5. 2487
      main.user.js
  6. 907
      package-lock.json
  7. 4
      package.json
  8. 16
      src/Components/App.svelte
  9. 13
      src/main.ts
  10. 68
      src/phash.ts
  11. 2
      src/stores.ts
  12. 62
      src/thirdeye.ts
  13. 4
      src/websites/index.ts

7
.gitignore

@ -15,4 +15,9 @@ ext
*.webm
yarn.lock
out
*.data
*.data
ext.js
core.js
fag
rollup.config.js
.vscode/settings.json

22
.vscode/settings.json

@ -1,3 +1,23 @@
{
"typescript.tsdk": "node_modules/typescript/lib"
"typescript.tsdk": "node_modules/typescript/lib",
"files.exclude": {
"**/dist": true,
"**/node_modules": true,
"**/*.png": true,
"**/*.html": true,
"**/*.c": true,
"**/*~": true,
"**/*.jpg": true,
"**/*.gif": true,
"**/*.exe": true,
"**/*.cs": true,
"**/*.out": true,
"**/*.zip": true,
"**/ext": true,
"**/*.mp4": true,
"**/*.webm": true,
"**/yarn.lock": true,
"**/out": true,
"**/*.data": true
}
}

10
main.d.ts

@ -6,4 +6,14 @@ declare module '*.png' {
export default new Uint8Array;
}
// Ambient typings for the 'blockhash' module so that imports of it type-check.
declare module 'blockhash' {
// Takes two strings and returns a number.
// NOTE(review): presumably the bit distance between two hex-encoded hashes — confirm against the library.
export const hammingDistance: (a: string, b: string) => number;
// NOTE(review): declared as a no-argument function returning nothing; this looks like a placeholder — confirm.
export const blockhash: () => void;
// Takes image data (width, height and a byte array), a bit count and a numeric method selector; returns a string.
// NOTE(review): `data` is presumably 4-bytes-per-pixel RGBA — confirm against the library.
export const blockhashData: (imgData: {
width: number,
height: number,
data: Uint8Array
}, bits: number, method: number) => string;
}
declare const QR: any;

2
main.meta.js

@ -1,7 +1,7 @@
// ==UserScript==
// @name PNGExtraEmbed
// @namespace https://coom.tech/
// @version 0.156
// @version 0.157
// @description uhh
// @author You
// @match https://boards.4channel.org/*

2487
main.user.js

File diff suppressed because it is too large

907
package-lock.json

File diff suppressed because it is too large

4
package.json

@ -14,10 +14,14 @@
"author": "",
"license": "ISC",
"dependencies": {
"blockhash": "^0.2.0",
"buffer": "^6.0.3",
"crc-32": "^1.2.0",
"events": "^3.3.0",
"file-type": "^17.0.2",
"image-hash": "^5.0.1",
"jpeg-js": "^0.4.3",
"png-js": "^1.0.0",
"readable-stream": "^3.6.0",
"ts-ebml": "^2.0.2"
},

16
src/Components/App.svelte

@ -2,7 +2,6 @@
import { hasContext, onDestroy } from 'svelte'
import Dialog from './Dialog.svelte'
import Tag from './Tag.svelte'
import type { Booru } from '../thirdeye'
import Tabs from './Tabs.svelte'
@ -129,6 +128,21 @@
Disable third-eye.
</label>
{#if !$settings.te}
<label>
<input type="checkbox" bind:checked={$settings.phash} />
Enable perceptual hash-based filtering
</label>
{#if $settings.phash}
<label>
<input type="number" bind:value={$settings.mdist} />
Minimum distance required (5 recommended)
<!-- svelte-ignore a11y-missing-attribute -->
<a
title="Higher will filter more potentially different images, lower will let more identical images through"
>?</a
>
</label>
{/if}
<h3>Booru sources</h3>
<div class="tagcont">
{#each $settings.rsources as source, i}

13
src/main.ts

@ -27,7 +27,7 @@ import { lolisafe } from "./filehosts";
export interface ImageProcessor {
skip?: true;
match(fn: string): boolean;
has_embed(b: Buffer, fn?: string): boolean | Promise<boolean>;
has_embed(b: Buffer, fn?: string, prevurl?: string): boolean | Promise<boolean>;
extract(b: Buffer, fn?: string): EmbeddedFile[] | Promise<EmbeddedFile[]>;
inject?(b: File, c: File[]): Buffer | Promise<Buffer>;
}
@ -95,13 +95,13 @@ type EmbeddedFileWithoutPreview = {
export type EmbeddedFile = EmbeddedFileWithPreview | EmbeddedFileWithoutPreview;
const processImage = async (src: string, fn: string, hex: string, onfound: () => void): Promise<([EmbeddedFile[], boolean] | undefined)[]> => {
const processImage = async (src: string, fn: string, hex: string, prevurl: string, onfound: () => void): Promise<([EmbeddedFile[], boolean] | undefined)[]> => {
return Promise.all(processors.filter(e => e.match(fn)).map(async proc => {
if (proc.skip) {
// skip file downloading, file is referenced from the filename
// basically does things like filtering out blacklisted tags
const md5 = Buffer.from(hex, 'base64');
if (await proc.has_embed(md5, fn) === true) {
if (await proc.has_embed(md5, fn, prevurl) === true) {
onfound();
return [await proc.extract(md5, fn), true] as [EmbeddedFile[], boolean];
} return;
@ -141,7 +141,10 @@ const processPost = async (post: HTMLDivElement) => {
const origlink = qp.getImageLink(post);
if (!origlink)
return;
let res2 = await processImage(origlink, qp.getFilename(post), qp.getMD5(post),
const thumbLink = qp.getThumbnailLink(post);
if (!thumbLink)
return;
let res2 = await processImage(origlink, qp.getFilename(post), qp.getMD5(post), thumbLink,
() => {
post.querySelector('.post')?.classList.add("embedfound");
});
@ -274,7 +277,7 @@ const scrapeBoard = async (self: HTMLButtonElement) => {
self.textContent = "Copy Results";
self.disabled = false;
self.onclick = () => {
copyTextToClipboard(text);
copyTextToClipboard(text);
};
};

68
src/phash.ts

@ -0,0 +1,68 @@
/**
 * Computes the median of a list of numbers.
 *
 * The input array is left untouched; a sorted copy is used instead.
 * For an even number of entries the two middle values are averaged.
 * An empty list produces NaN.
 */
const median = (data: number[]) => {
    const sorted = data.slice().sort((lhs, rhs) => lhs - rhs);
    const count = sorted.length;
    const upperMid = Math.floor(count / 2);
    if (count % 2 !== 0) {
        return sorted[upperMid];
    }
    return (sorted[upperMid - 1] + sorted[upperMid]) / 2.0;
};
/**
 * Converts per-block brightness totals into 0/1 values, in place.
 *
 * The block list is split into four consecutive bands of equal length and
 * every block is compared against the median of its own band.
 *
 * @param blocks summed channel values per block; overwritten with 0 or 1
 * @param pixels_per_block number of pixels that contributed to each block total
 */
const translate_blocks_to_bits = (blocks: number[], pixels_per_block: number) => {
    // Half of the largest value the original expression allows per block (pixels * 256 * 3).
    const midpoint = pixels_per_block * 256 * 3 / 2;
    const bandLength = blocks.length / 4;
    for (let band = 0; band < 4; band++) {
        const bandStart = band * bandLength;
        const bandEnd = (band + 1) * bandLength;
        // The median is taken before any block of this band is overwritten.
        const bandMedian = median(blocks.slice(bandStart, bandEnd));
        for (let idx = bandStart; idx < bandEnd; idx++) {
            const value = blocks[idx];
            // Blocks within 1 of the median are only set when the median itself is above the midpoint.
            const nearMedianAndBright = Math.abs(value - bandMedian) < 1 && bandMedian > midpoint;
            blocks[idx] = (value > bandMedian || nearMedianAndBright) ? 1 : 0;
        }
    }
};
/**
 * Encodes a list of 0/1 values as a lowercase hexadecimal string.
 *
 * Bits are consumed four at a time, most significant first; a trailing group
 * shorter than four bits is encoded from the bits that remain.
 */
const bits_to_hexhash = (bitsArray: number[]) => {
    let encoded = '';
    for (let start = 0; start < bitsArray.length; start += 4) {
        const nibbleText = bitsArray.slice(start, start + 4).join('');
        encoded += parseInt(nibbleText, 2).toString(16);
    }
    return encoded;
};
/**
 * Computes a block-based perceptual hash of an image and returns it as a hex string.
 *
 * The image is divided into a `bits` x `bits` grid of equally sized blocks
 * (pixels beyond the last whole block are ignored). The channel values of each
 * block are summed, and the sums are then turned into bits and hex-encoded.
 *
 * @param data image dimensions plus a pixel buffer read at 4 bytes per pixel
 * @param bits number of blocks per row and per column
 * @returns the hash as produced by `bits_to_hexhash`
 */
export const bmvbhash_even = (data: {
    width: number;
    height: number;
    data: Uint8Array;
}, bits: number) => {
    const { width, height, data: pixels } = data;
    const blockWidth = Math.floor(width / bits);
    const blockHeight = Math.floor(height / bits);
    const blockSums: number[] = [];
    for (let blockRow = 0; blockRow < bits; blockRow++) {
        const top = blockRow * blockHeight;
        for (let blockCol = 0; blockCol < bits; blockCol++) {
            const left = blockCol * blockWidth;
            let sum = 0;
            for (let row = top; row < top + blockHeight; row++) {
                for (let col = left; col < left + blockWidth; col++) {
                    const offset = (row * width + col) * 4;
                    // A pixel whose fourth byte is 0 is counted as 765 (3 * 255).
                    sum += pixels[offset + 3] === 0
                        ? 765
                        : pixels[offset] + pixels[offset + 1] + pixels[offset + 2];
                }
            }
            blockSums.push(sum);
        }
    }
    translate_blocks_to_bits(blockSums, blockWidth * blockHeight);
    return bits_to_hexhash(blockSums);
};

2
src/stores.ts

@ -23,6 +23,8 @@ export const settings = writable(localLoad('settingsv2', {
sh: false,
ep: false,
expte: false,
mdist: -1,
phash: false,
hotlink: false,
vercheck: false,
fhost: 0,

62
src/thirdeye.ts

@ -2,6 +2,8 @@ import type { EmbeddedFile, ImageProcessor } from "./main";
import { GM_fetch } from "./requests";
import { localLoad, settings } from "./stores";
import { Buffer } from "buffer";
import jpeg from 'jpeg-js';
import { bmvbhash_even } from "./phash";
export let csettings: Parameters<typeof settings['set']>[0];
settings.subscribe(b => {
@ -50,13 +52,20 @@ const gelquirk: (s: string) => tran = prefix => (a =>
} as BooruMatch)) || []);
let experimentalApi = false;
let black = new Set<string>();
let phashEn = false;
let mindist = 5;
// Mirrors the relevant user settings into this module's local state whenever they change.
settings.subscribe(s => {
    experimentalApi = s.expte;
    // Rebuild the booru list so each source carries the quirk handler derived from its view URL.
    boorus = s.rsources.map(e => ({
        ...e,
        quirks: gelquirk(e.view)
    }));
    black = new Set(s.blacklist);
    // Only fall back to 5 when no usable number is stored.
    // The previous `s.mdist || 5` also replaced an explicit threshold of 0 with 5.
    mindist = typeof s.mdist === 'number' && !Number.isNaN(s.mdist) ? s.mdist : 5;
    phashEn = s.phash;
});
export let boorus: Booru[] =
localLoad('settingsv2', { rsources: [] as (Omit<Booru, 'quirks'> & { view: string, disabled?: boolean })[] })
.rsources.map(e => ({
@ -64,12 +73,6 @@ export let boorus: Booru[] =
quirks: gelquirk(e.view)
}));
let black = new Set<string>();
settings.subscribe(s => {
black = new Set(s.blacklist);
});
const bufferingTime = 2000;
let expired: number | undefined = undefined;
type ApiResult = { [md5 in string]: { [domain in string]: BooruMatch[] } };
@ -157,18 +160,37 @@ const extract = async (b: Buffer, fn?: string) => {
const full = result[0].full_url;
return [{
source: result[0].source,
page: { title: booru, url: result[0].page },
page: {
title: booru,
url: result[0].page
},
filename: fn!.substring(0, 33) + result[0].ext,
thumbnail: (await (await GM_fetch(prev || full)).arrayBuffer()), // prefer preview
data: csettings.hotlink ? (full || prev) : async (lsn) => {
thumbnail: (await (await GM_fetch(prev || full)).arrayBuffer()),
data: csettings.hotlink ? (full || prev) : (async (lsn) => {
if (!cachedFile)
cachedFile = (await (await GM_fetch(full || prev, undefined, lsn)).arrayBuffer()); // prefer full
cachedFile = (await (await GM_fetch(full || prev, undefined, lsn)).arrayBuffer());
return cachedFile;
}
})
} as EmbeddedFile];
};
const has_embed = async (b: Buffer, fn?: string) => {
/**
 * Decodes a buffer with `jpeg.decode` and returns the 8x8-block hash of the
 * decoded image as computed by `bmvbhash_even`.
 */
const phash = (b: Buffer) => bmvbhash_even(jpeg.decode(b), 8);
/**
 * Counts the bits that differ between two values.
 *
 * @param a hexadecimal string without a `0x` prefix
 * @param b hexadecimal string without a `0x` prefix
 * @returns the number of set bits in `a XOR b`
 */
const hammingDist = (a: string, b: string) => {
    const difference = BigInt('0x' + a) ^ BigInt('0x' + b);
    // Both operands are non-negative, so the binary text has no sign and each '1' is one differing bit.
    let differingBits = 0;
    for (const digit of difference.toString(2)) {
        if (digit === '1') {
            differingBits++;
        }
    }
    return differingBits;
};
const has_embed = async (b: Buffer, fn?: string, prevlink?: string) => {
// It's not worth skipping images whose filenames match their md5, because
// 4chan re-encodes JPEGs, which account for well over half of the files posted
@ -185,6 +207,22 @@ const has_embed = async (b: Buffer, fn?: string) => {
if (result.length)
break;
}
if ((result && result.length != 0) && phashEn && prevlink) {
const getHash = async (l: string) => {
const ogreq = await GM_fetch(l);
const origPreview = await ogreq.arrayBuffer();
return await phash(Buffer.from(origPreview));
};
const [orighash, tehash] = await Promise.all([
getHash(prevlink),
getHash(result[0].preview_url)
]);
const d = hammingDist(orighash, tehash);
console.log(d, prevlink);
return d > mindist;
}
return result && result.length != 0;
};

4
src/websites/index.ts

@ -5,6 +5,7 @@ export type QueryProcessor = {
settingsHost: () => HTMLSpanElement;
catalogControlHost: () => HTMLDivElement;
getImageLink: (post: HTMLElement) => string;
getThumbnailLink: (post: HTMLElement) => string;
getFilename: (post: HTMLElement) => string;
getMD5: (post: HTMLElement) => string;
getInfoBox: (post: HTMLElement) => HTMLElement;
@ -24,6 +25,7 @@ export const V4chan: QueryProcessor = {
return a?.textContent || '';
},
getMD5: (post: HTMLElement) => post.querySelector("img[data-md5]")?.getAttribute("data-md5") || '',
getThumbnailLink: (post: HTMLElement) => post.querySelector("img[data-md5]")?.getAttribute("src") || '',
getInfoBox: post => post.querySelector("div.fileText")!
};
@ -40,6 +42,7 @@ export const X4chan: QueryProcessor = {
return (origlink.querySelector('.fnfull') || origlink)?.textContent || '';
},
getMD5: (post: HTMLElement) => post.querySelector("img[data-md5]")?.getAttribute("data-md5") || '',
getThumbnailLink: (post: HTMLElement) => post.querySelector("img[data-md5]")?.getAttribute("src") || '',
getInfoBox: post => post.querySelector("span.file-info")!
};
@ -58,6 +61,7 @@ export const FoolFuuka: QueryProcessor = {
return a?.title || '';
},
getMD5: (post: HTMLElement) => post.querySelector("img[data-md5]")?.getAttribute("data-md5") || '',
getThumbnailLink: (post: HTMLElement) => post.querySelector("img[data-md5]")?.getAttribute("src") || '',
getInfoBox: post => post.querySelector("span.post_controls")!
};

Loading…
Cancel
Save