This commit is contained in:
2025-11-30 14:00:46 +01:00
parent f8a1fbeb50
commit 9a1feb2912
37 changed files with 3079 additions and 7301 deletions

View File

@@ -0,0 +1,216 @@
// DOM extraction logic for Playwright-based HTML dumps.
// Runs in the browser context via page.evaluate.
/**
 * One compact record describing an interactive element found by `extractDom`.
 * Field names are deliberately short to keep the serialized export small.
 */
export type ExportedElement = {
// Short tag code produced by `shortTag` (e.g. "bu" for BUTTON, "in" for INPUT).
el: string;
// Full ancestor path: one chunk per element below document.body, joined with ">".
x: string;
// Whitespace-collapsed `innerText`, truncated to 60 characters.
t?: string;
// Cleaned `aria-label`; only set when it differs from `t`.
l?: string;
// Cleaned `placeholder`; only set when it differs from both `t` and `l`.
p?: string;
// Raw `name` attribute.
n?: string;
// Element id; only set when `isDynamicId` does not flag it as auto-generated.
i?: string;
// Raw `data-testid` attribute.
d?: string;
// Raw `role` attribute.
r?: string;
// Three-hex-digit hash added when an earlier item produced the same dedup key.
h?: string;
};
/**
 * Builds a compact list of the interactive elements (button, a, input,
 * select, textarea) in the current document.
 *
 * The function is passed to `page.evaluate`, so it must stay fully
 * self-contained: every helper is declared inside the body and nothing from
 * the enclosing module may be referenced at runtime.
 *
 * @returns One record per structurally distinct element, in document order.
 *          Elements that carry no text, label, placeholder, name, stable id,
 *          test id or role are omitted.
 */
export const extractDom = (): ExportedElement[] => {
  type PendingItem = ExportedElement & { keyPath: string };

  const MAX_TEXT = 60;
  // data-* attributes that are always trusted and also feed the structural key.
  const KEY_DATA_ATTRS = ["data-testid", "data-modal-component", "data-modal-id"];

  // Collapses runs of whitespace and truncates; empty/absent input becomes null.
  const clean = (t: string | null): string | null =>
    t ? t.replace(/\s+/g, " ").trim().slice(0, MAX_TEXT) : null;

  // Flags ids containing ":-" or a ":xyz:" segment as auto-generated.
  const isDynamicId = (id: string | null): boolean =>
    !!id && (id.includes(":-") || /:[a-z0-9]+:/i.test(id));

  const shortTag = (t: string): string =>
    (
      {
        BUTTON: "bu",
        A: "a",
        INPUT: "in",
        SELECT: "s",
        TEXTAREA: "ta",
        DIV: "d",
        SPAN: "sp",
      } as Record<string, string>
    )[t] || t.toLowerCase();

  // Class names that are too short, generated-looking, or prefix-hash shaped.
  // The last pattern uses single backslashes: inside a regex literal "\\w"
  // would match a literal backslash followed by "w" and never fire.
  const isNoiseClass = (c: string): boolean =>
    !c ||
    c.length < 3 ||
    /^css-/.test(c) ||
    /^[a-z0-9]{6,}$/i.test(c) ||
    /^\w{1,3}-\w{4,}$/.test(c);

  const isKeyDataAttr = (name: string): boolean =>
    KEY_DATA_ATTRS.includes(name);

  // A data-* attribute is kept when it is one of the key attributes or its
  // value is a short token. The character whitelist already excludes
  // whitespace, so no separate whitespace test is needed.
  const isStableDataAttr = (name: string, value: string | null): boolean =>
    isKeyDataAttr(name) ||
    (value != null && value.length <= 40 && /^[a-z0-9_.:-]+$/i.test(value));

  // Fallback chunk when an element has no usable attributes: up to two
  // non-noise classes, or the placeholder "c0".
  const classChunk = (element: Element): string => {
    const kept = Array.from(element.classList)
      .filter((c) => !isNoiseClass(c))
      .slice(0, 2);
    return "." + (kept.length ? kept : ["c0"]).join(".");
  };

  // Walks from `el` up to (not including) document.body and returns two
  // paths: `full` uses id, every stable data-* attribute and role; `key` uses
  // only id, the key data-* attributes and role, and drives deduplication.
  const getPaths = (el: Element): { full: string; key: string } => {
    const parts: string[] = [];
    const keyParts: string[] = [];
    let node: Node | null = el;
    while (node && node.nodeType === 1 && node !== document.body) {
      const element = node as Element;
      const tag = shortTag(element.tagName);
      const attrs: string[] = [];
      const keyAttrs: string[] = [];

      if (element.id) {
        const idChunk = "#" + element.id;
        attrs.push(idChunk);
        keyAttrs.push(idChunk);
      }

      const attrNames = element.getAttributeNames
        ? element.getAttributeNames()
        : [];
      for (const name of attrNames) {
        if (!name.startsWith("data-")) continue;
        const value = element.getAttribute(name);
        if (!isStableDataAttr(name, value)) continue;
        const attrChunk = "[" + name + "=" + value + "]";
        attrs.push(attrChunk);
        if (isKeyDataAttr(name)) keyAttrs.push(attrChunk);
      }

      const role = element.getAttribute ? element.getAttribute("role") : null;
      if (role) {
        const roleChunk = "[r=" + role + "]";
        attrs.push(roleChunk);
        keyAttrs.push(roleChunk);
      }

      parts.unshift(tag + (attrs.length ? attrs.join("") : classChunk(element)));
      keyParts.unshift(
        tag + (keyAttrs.length ? keyAttrs.join("") : classChunk(element))
      );
      node = element.parentNode;
    }
    return { full: parts.join(">"), key: keyParts.join(">") };
  };

  const items: ExportedElement[] = [];
  const seenStructure = new Set<string>();
  const seen = new Map<string, number>();

  const addItem = (o: PendingItem): void => {
    // Keep only the first element for each tag + structural key path.
    const structureKey = o.el + "|" + o.keyPath;
    if (seenStructure.has(structureKey)) return;
    seenStructure.add(structureKey);

    const keyParts = [o.el, o.x];
    if (o.t) keyParts.push("t=" + o.t);
    if (o.l) keyParts.push("l=" + o.l);
    if (o.p) keyParts.push("p=" + o.p);
    if (o.n) keyParts.push("n=" + o.n);
    if (o.i) keyParts.push("i=" + o.i);
    if (o.d) keyParts.push("d=" + o.d);
    if (o.r) keyParts.push("r=" + o.r);
    const key = keyParts.join("|");

    const prev = seen.get(key) || 0;
    if (prev > 0) {
      // Repeated key: attach a 12-bit hash of key + occurrence number.
      let hVal = 0;
      const str = key + "#" + prev;
      for (let i = 0; i < str.length; i++) {
        hVal = (hVal * 31 + str.charCodeAt(i)) >>> 0;
      }
      o.h = (hVal & 0xfff).toString(16).padStart(3, "0");
    }
    seen.set(key, prev + 1);

    // keyPath is internal bookkeeping and must not leak into the export.
    const { keyPath: _keyPath, ...rest } = o;
    items.push(rest);
  };

  const elements = Array.from(
    document.querySelectorAll<HTMLElement>("button,a,input,select,textarea")
  );
  for (const e of elements) {
    const t = clean(e.innerText);
    const l = clean(e.getAttribute("aria-label"));
    const p = clean(e.getAttribute("placeholder"));
    const n = e.getAttribute("name");
    const r = e.getAttribute("role");
    const id = e.id || null;
    // The id itself when present and not auto-generated, otherwise null.
    const stableId = id && !isDynamicId(id) ? id : null;
    const d = e.getAttribute("data-testid");

    if (r === "menuitem" && !t && !l && !p) continue;
    // Skip elements that offer nothing to identify them by.
    if (!(t || l || p || n || stableId || d || r)) continue;

    const { full, key } = getPaths(e);
    const o: PendingItem = {
      el: shortTag(e.tagName),
      x: full,
      keyPath: key,
    };
    if (t) o.t = t;
    if (l && l !== t) o.l = l;
    if (p && p !== t && p !== l) o.p = p;
    if (n) o.n = n;
    if (stableId) o.i = stableId;
    if (d) o.d = d;
    if (r) o.r = r;
    addItem(o);
  }
  return items;
};

View File

@@ -2,309 +2,23 @@
// NOT for production automation; intended as a developer utility to generate compact DOM exports
// for manual inspection and to aid writing Playwright automations.
const { chromium } = require("playwright");
const fs = require("fs").promises;
const path = require("path");
import { chromium } from "playwright";
import { promises as fs } from "fs";
import path from "path";
import { extractDom, ExportedElement } from "./domExtractor";
// Directory scanned for HTML dump files, resolved against the current working directory.
const INPUT_DIR = path.join(process.cwd(), "html-dumps");
// Directory that receives one JSON export per HTML dump, resolved the same way.
const OUTPUT_DIR = path.join(process.cwd(), "html-dumps-optimized");
// Browser-side script kept as source text: an immediately-invoked arrow
// function that scans button/a/input/select/textarea elements, builds one
// compact record per element, logs the JSON length, the element count and the
// records to the console, and returns the records.
// Backslashes are doubled because this is a template literal; the resulting
// script text contains the regex escapes \s and \w.
// NOTE(review): `hasDataAttr` inside the script is assigned but never read.
const domExportScript = `(() => {
const MAX_TEXT = 60;
const clean = t =>
t ? t.replace(/\\s+/g, " ").trim().slice(0, MAX_TEXT) : null;
const isDynamicId = id =>
id && (id.includes(":-") || /:[a-z0-9]+:/i.test(id));
const shortTag = t => ({
BUTTON: "bu",
A: "a",
INPUT: "in",
SELECT: "s",
TEXTAREA: "ta",
DIV: "d",
SPAN: "sp"
}[t] || t.toLowerCase());
const isNoiseClass = c =>
!c ||
c.length < 3 ||
/^css-/.test(c) ||
/^[a-z0-9]{6,}$/i.test(c) ||
/^\\w{1,3}-\\w{4,}$/.test(c);
const siblingIndex = node => {
const sib = [...node.parentNode.children]
.filter(n => n.tagName === node.tagName);
return { idx: sib.indexOf(node), count: sib.length };
};
const getSemSiblingPath = el => {
const parts = [];
let node = el;
while (node && node.nodeType === 1 && node !== document.body) {
const tag = shortTag(node.tagName);
const { idx, count } = siblingIndex(node);
const isTarget = node === el;
const targetSuffix = isTarget && idx >= 0 ? ":" + idx : "";
const parentSuffix = !isTarget && count > 1 && idx >= 0 ? "@" + idx : "";
const sibSuffix = targetSuffix || parentSuffix;
const id = node.id;
const attrNames = node.getAttributeNames ? node.getAttributeNames() : [];
const attrs = [];
let hasDataAttr = false;
if (id) attrs.push("#" + id);
for (const a of attrNames) {
if (a.startsWith("data-")) {
attrs.push("[" + a + "=" + node.getAttribute(a) + "]");
hasDataAttr = true;
}
}
const role = node.getAttribute ? node.getAttribute("role") : null;
if (role) attrs.push("[r=" + role + "]");
let chunk = tag;
if (attrs.length > 0) {
chunk += attrs.join("");
} else {
let cls = [...node.classList].filter(c => !isNoiseClass(c));
if (cls.length > 2) cls = cls.slice(0, 2);
if (!cls.length) cls = ["c0"];
chunk += "." + cls.join(".");
}
chunk += (sibSuffix || "");
parts.unshift(chunk);
node = node.parentNode;
}
return parts.join(">");
};
const items = [];
const seen = new Map();
const addItem = o => {
const keyParts = [o.el, o.x];
if (o.t) keyParts.push("t=" + o.t);
if (o.l) keyParts.push("l=" + o.l);
if (o.p) keyParts.push("p=" + o.p);
if (o.n) keyParts.push("n=" + o.n);
if (o.i) keyParts.push("i=" + o.i);
if (o.d) keyParts.push("d=" + o.d);
if (o.r) keyParts.push("r=" + o.r);
const key = keyParts.join("|");
const prev = seen.get(key) || 0;
if (prev > 0) {
let h = 0;
const str = key + "#" + prev;
for (let i = 0; i < str.length; i++) {
h = (h * 31 + str.charCodeAt(i)) >>> 0;
}
const hex = (h & 0xfff).toString(16).padStart(3, "0");
o.h = hex;
}
seen.set(key, prev + 1);
items.push(o);
};
const elements = [...document.querySelectorAll("button,a,input,select,textarea")];
for (const e of elements) {
const t = clean(e.innerText);
const l = clean(e.getAttribute("aria-label"));
const p = clean(e.getAttribute("placeholder"));
const n = e.getAttribute("name");
const r = e.getAttribute("role");
const id = e.id;
const stableId = isDynamicId(id) ? null : id;
const d = e.getAttribute("data-testid");
if (r === "menuitem" && !t && !l && !p) continue;
if (!(t || l || p || n || stableId || d || r)) continue;
const o = { el: shortTag(e.tagName), x: getSemSiblingPath(e) };
if (t) o.t = t;
if (l && l !== t) o.l = l;
if (p && p !== t && p !== l) o.p = p;
if (n) o.n = n;
if (stableId) o.i = stableId;
if (d) o.d = d;
if (r) o.r = r;
addItem(o);
}
const json = JSON.stringify(items, null, 2);
console.log("chars:", json.length);
console.log("elements:", items.length);
console.log(items);
return items;
})();`;
// Source text of a bare (not self-invoking) arrow function with the same
// extraction steps as domExportScript, minus the console logging. It is
// evaluated in the page by wrapping it as
// new Function("return (" + domExtractor + ")()").
// Backslashes are doubled because this is a template literal; the resulting
// function text contains the regex escapes \s and \w.
// NOTE(review): `hasDataAttr` inside the function is assigned but never read.
const domExtractor = `() => {
const MAX_TEXT = 60;
const clean = t =>
t ? t.replace(/\\s+/g, " ").trim().slice(0, MAX_TEXT) : null;
const isDynamicId = id =>
id && (id.includes(":-") || /:[a-z0-9]+:/i.test(id));
const shortTag = t => ({
BUTTON: "bu",
A: "a",
INPUT: "in",
SELECT: "s",
TEXTAREA: "ta",
DIV: "d",
SPAN: "sp"
}[t] || t.toLowerCase());
const isNoiseClass = c =>
!c ||
c.length < 3 ||
/^css-/.test(c) ||
/^[a-z0-9]{6,}$/i.test(c) ||
/^\\w{1,3}-\\w{4,}$/.test(c);
const siblingIndex = node => {
const sib = [...node.parentNode.children]
.filter(n => n.tagName === node.tagName);
return { idx: sib.indexOf(node), count: sib.length };
};
const getSemSiblingPath = el => {
const parts = [];
let node = el;
while (node && node.nodeType === 1 && node !== document.body) {
const tag = shortTag(node.tagName);
const { idx, count } = siblingIndex(node);
const isTarget = node === el;
const targetSuffix = isTarget && idx >= 0 ? ":" + idx : "";
const parentSuffix = !isTarget && count > 1 && idx >= 0 ? "@" + idx : "";
const sibSuffix = targetSuffix || parentSuffix;
const id = node.id;
const attrNames = node.getAttributeNames ? node.getAttributeNames() : [];
const attrs = [];
let hasDataAttr = false;
if (id) attrs.push("#" + id);
for (const a of attrNames) {
if (a.startsWith("data-")) {
attrs.push("[" + a + "=" + node.getAttribute(a) + "]");
hasDataAttr = true;
}
}
const role = node.getAttribute ? node.getAttribute("role") : null;
if (role) attrs.push("[r=" + role + "]");
let chunk = tag;
if (attrs.length > 0) {
chunk += attrs.join("");
} else {
let cls = [...node.classList].filter(c => !isNoiseClass(c));
if (cls.length > 2) cls = cls.slice(0, 2);
if (!cls.length) cls = ["c0"];
chunk += "." + cls.join(".");
}
chunk += (sibSuffix || "");
parts.unshift(chunk);
node = node.parentNode;
}
return parts.join(">");
};
const items = [];
const seen = new Map();
const addItem = o => {
const keyParts = [o.el, o.x];
if (o.t) keyParts.push("t=" + o.t);
if (o.l) keyParts.push("l=" + o.l);
if (o.p) keyParts.push("p=" + o.p);
if (o.n) keyParts.push("n=" + o.n);
if (o.i) keyParts.push("i=" + o.i);
if (o.d) keyParts.push("d=" + o.d);
if (o.r) keyParts.push("r=" + o.r);
const key = keyParts.join("|");
const prev = seen.get(key) || 0;
if (prev > 0) {
let h = 0;
const str = key + "#" + prev;
for (let i = 0; i < str.length; i++) {
h = (h * 31 + str.charCodeAt(i)) >>> 0;
}
const hex = (h & 0xfff).toString(16).padStart(3, "0");
o.h = hex;
}
seen.set(key, prev + 1);
items.push(o);
};
const elements = [...document.querySelectorAll("button,a,input,select,textarea")];
for (const e of elements) {
const t = clean(e.innerText);
const l = clean(e.getAttribute("aria-label"));
const p = clean(e.getAttribute("placeholder"));
const n = e.getAttribute("name");
const r = e.getAttribute("role");
const id = e.id;
const stableId = isDynamicId(id) ? null : id;
const d = e.getAttribute("data-testid");
if (r === "menuitem" && !t && !l && !p) continue;
if (!(t || l || p || n || stableId || d || r)) continue;
const o = { el: shortTag(e.tagName), x: getSemSiblingPath(e) };
if (t) o.t = t;
if (l && l !== t) o.l = l;
if (p && p !== t && p !== l) o.p = p;
if (n) o.n = n;
if (stableId) o.i = stableId;
if (d) o.d = d;
if (r) o.r = r;
addItem(o);
}
return items;
}`;
module.exports = { domExportScript };
async function ensureDir(dir: string) {
async function ensureDir(dir: string): Promise<void> {
try {
await fs.mkdir(dir, { recursive: true });
} catch {}
} catch {
// ignore
}
}
async function exportAll() {
async function exportAll(): Promise<void> {
await ensureDir(OUTPUT_DIR);
async function collectHtmlFiles(dir: string): Promise<string[]> {
@@ -321,7 +35,7 @@ async function exportAll() {
return results;
}
let htmlFiles = [];
let htmlFiles: string[] = [];
try {
htmlFiles = await collectHtmlFiles(INPUT_DIR);
} catch (err) {
@@ -346,13 +60,26 @@ async function exportAll() {
const url = "file://" + abs;
const page = await browser.newPage();
try {
await page.goto(url, { waitUntil: "domcontentloaded", timeout: 10000 });
const items = await page.evaluate(
new Function("return (" + domExtractor + ")()")
await page.addInitScript({
content: "window.__name = window.__name || (fn => fn);",
});
await page.goto(url, {
waitUntil: "domcontentloaded",
timeout: 10000,
});
const items = (await page.evaluate(
extractDom as () => ExportedElement[]
)) as unknown as ExportedElement[];
const outPath = path.join(
OUTPUT_DIR,
file.replace(/\.html$/, ".json")
);
const outPath = path.join(OUTPUT_DIR, file.replace(/\.html$/, ".json"));
await fs.mkdir(path.dirname(outPath), { recursive: true });
await fs.writeFile(outPath, JSON.stringify(items, null, 2), "utf8");
await fs.writeFile(
outPath,
JSON.stringify(items, null, 2),
"utf8"
);
console.log(
"exported " +
file +
@@ -373,7 +100,7 @@ async function exportAll() {
}
}
if (require.main === module) {
if (typeof require !== "undefined" && require.main === module) {
exportAll().catch((err) => {
console.error(err);
process.exit(1);

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env node
const fs = require("fs/promises");
const path = require("path");
const { spawnSync } = require("child_process");
// All paths are resolved against the directory the script is launched from.
const ROOT = process.cwd();
// Input: read by processWorkflows to discover workflow sub-directories.
const HTML_DUMPS_DIR = path.join(ROOT, "html-dumps");
// Output: wiped at the start of every run, then checked per workflow.
const EXPORTS_DIR = path.join(ROOT, "html-dumps-optimized");
// On Windows the npx launcher is named "npx.cmd"; elsewhere plain "npx".
const npxCmd = process.platform === "win32" ? "npx.cmd" : "npx";
/**
 * Deletes the exports directory and everything beneath it.
 * `force` makes a missing directory a no-op instead of an error.
 */
async function removeExportsDir(): Promise<void> {
  const rmOptions = { recursive: true, force: true };
  await fs.rm(EXPORTS_DIR, rmOptions);
}
/**
 * Runs a command synchronously with inherited stdio and throws unless it
 * exits with status 0.
 *
 * @param cmd     Executable to launch.
 * @param args    Arguments passed to the executable.
 * @param options Optional overrides merged into the spawn options (e.g. `cwd`).
 * @throws Error when the process cannot be run, is terminated by a signal,
 *         or exits with a non-zero status.
 */
function runStep(
  cmd: string,
  args: string[],
  options: { cwd?: string } = {}
): void {
  const result = spawnSync(cmd, args, { stdio: "inherit", ...options });
  const label = cmd + " " + args.join(" ");

  // spawnSync reports launch failures (e.g. ENOENT) via `error`; in that case
  // `status` is null, so the underlying reason has to be surfaced explicitly.
  if (result.error) {
    throw new Error(label + " could not be run: " + result.error.message);
  }
  // A null status without an error means the child was killed by a signal.
  if (result.status === null) {
    throw new Error(label + " was terminated by signal " + result.signal);
  }
  if (result.status !== 0) {
    throw new Error(label + " failed with code " + result.status);
  }
}
/**
 * Rebuilds all exports from scratch: clears the exports directory, runs the
 * HTML dump exporter, then runs the DOM diff builder once for every workflow
 * directory under the dumps directory that also exists under the exports
 * directory.
 */
async function processWorkflows(): Promise<void> {
  await removeExportsDir();
  runStep(npxCmd, ["tsx", "scripts/dom-export/exportHtmlDumps.ts"]);

  const dumpEntries = await fs.readdir(HTML_DUMPS_DIR, { withFileTypes: true });
  const workflowDirs = dumpEntries.filter((dirent) => dirent.isDirectory());

  for (const workflow of workflowDirs) {
    const exportedDir = path.join(EXPORTS_DIR, workflow.name);

    // Workflows for which the exporter produced nothing are skipped silently.
    let wasExported = true;
    try {
      await fs.access(exportedDir);
    } catch {
      wasExported = false;
    }

    if (wasExported) {
      runStep(npxCmd, [
        "tsx",
        "scripts/dom-export/buildDomDiffs.ts",
        exportedDir,
      ]);
    }
  }
}
// Script entry point: run the whole pipeline; on any failure log the error
// and exit with status 1.
processWorkflows().catch((err: unknown) => {
console.error(err);
process.exit(1);
});
export {};