From 47d8470372732d86295cbfc510e27ed29bea6696 Mon Sep 17 00:00:00 2001
From: Aidan
Date: Wed, 23 Apr 2025 20:19:47 -0400
Subject: [PATCH] [scraper] func: add database, docker support, ingest functions, and basic menu

---
 README.md                       |  1 +
 UNLICENSE                       | 24 ++++++++++++
 scraper/.gitignore              | 43 +++++++++++++++++++++
 scraper/docker-compose.yml      | 10 +++++
 scraper/drizzle.config.ts       | 11 ++++++
 scraper/package.json            | 25 +++++++++++++
 scraper/src/db/schema.ts        | 18 +++++++++
 scraper/src/index.ts            | 60 ++++++++++++++++++++++++++++++
 scraper/src/ingest/ingest.ts    | 66 +++++++++++++++++++++++++++++++++
 scraper/src/util/osFunctions.ts | 16 ++++++++
 scraper/src/util/toScrape.ts    | 28 ++++++++++++++
 scraper/tsconfig.json           | 28 ++++++++++++++
 12 files changed, 330 insertions(+)
 create mode 100644 README.md
 create mode 100644 UNLICENSE
 create mode 100644 scraper/.gitignore
 create mode 100644 scraper/docker-compose.yml
 create mode 100644 scraper/drizzle.config.ts
 create mode 100644 scraper/package.json
 create mode 100644 scraper/src/db/schema.ts
 create mode 100644 scraper/src/index.ts
 create mode 100644 scraper/src/ingest/ingest.ts
 create mode 100644 scraper/src/util/osFunctions.ts
 create mode 100644 scraper/src/util/toScrape.ts
 create mode 100644 scraper/tsconfig.json

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d4aee1a
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# navigate
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..efb9808
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <https://unlicense.org>
diff --git a/scraper/.gitignore b/scraper/.gitignore
new file mode 100644
index 0000000..d7227d3
--- /dev/null
+++ b/scraper/.gitignore
@@ -0,0 +1,43 @@
+# dependencies (bun install)
+node_modules
+
+# output
+out
+dist
+*.tgz
+
+# code coverage
+coverage
+*.lcov
+
+# logs
+logs
+_.log
+report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
+
+# dotenv environment variable files
+.env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# caches
+.eslintcache
+.cache
+*.tsbuildinfo
+
+# IntelliJ based IDEs
+.idea
+
+# Finder (MacOS) folder config
+.DS_Store
+
+# ingest files
+src/ingest/urls.txt
+
+# postgres
+postgres
+
+# bun
+bun.lock*
\ No newline at end of file
diff --git a/scraper/docker-compose.yml b/scraper/docker-compose.yml
new file mode 100644
index 0000000..066721f
--- /dev/null
+++ b/scraper/docker-compose.yml
@@ -0,0 +1,10 @@
+services:
+  postgres:
+    image: postgres:17
+    environment:
+      POSTGRES_PASSWORD: abfi29239ed98q93aEa89EriiaKaye896quAirhaAu
+      POSTGRES_DB: scraper
+    ports:
+      - 5432:5432
+    volumes:
+      - ./postgres:/var/lib/postgresql/data
\ No newline at end of file
diff --git a/scraper/drizzle.config.ts b/scraper/drizzle.config.ts
new file mode 100644
index 0000000..fb71383
--- /dev/null
+++ b/scraper/drizzle.config.ts
@@ -0,0 +1,11 @@
+import "dotenv/config"
+import { defineConfig } from "drizzle-kit"
+
+export default defineConfig({
+  out: "./drizzle",
+  schema: "./src/db/schema.ts",
+  dialect: "postgresql",
+  dbCredentials: {
+    url: process.env.DATABASE_URL!,
+  },
+})
\ No newline at end of file
diff --git a/scraper/package.json b/scraper/package.json
new file mode 100644
index 0000000..7cd33b1
--- /dev/null
+++ b/scraper/package.json
@@ -0,0 +1,25 @@
+{
+  "name": "navigate",
+  "module": "src/index.ts",
+  "type": "module",
+  "private": true,
+  "scripts": {
+    "start": "tsx src/index.ts"
+  },
+  "devDependencies": {
+    "@types/bun": "latest",
+    "@types/pg": "^8.11.13",
+    "drizzle-kit": "^0.31.0",
+    "tsx": "^4.19.3"
+  },
+  "peerDependencies": {
+    "typescript": "^5"
+  },
+  "dependencies": {
+    "axios": "^1.8.4",
+    "cheerio": "^1.0.0",
+    "dotenv": "^16.5.0",
+    "drizzle-orm": "^0.42.0",
+    "pg": "^8.15.5"
+  }
+}
diff --git a/scraper/src/db/schema.ts b/scraper/src/db/schema.ts
new file mode 100644
index 0000000..6327eab
--- /dev/null
+++ b/scraper/src/db/schema.ts
@@ -0,0 +1,18 @@
+import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"
+
+export const scrapeQueue = pgTable("scrape_queue", {
+  id: integer().primaryKey().generatedAlwaysAsIdentity(),
+  url: varchar({ length: 255 }).notNull(),
+  status: varchar({ length: 255 }).notNull(),
+  createdAt: timestamp().notNull().defaultNow(),
+  updatedAt: timestamp().notNull().defaultNow(),
+})
+
+export const searchData = pgTable("search_data", {
+  id: integer().primaryKey().generatedAlwaysAsIdentity(),
+  url: varchar({ length: 255 }).notNull(),
+  title: varchar({ length: 255 }).notNull(),
+  description: varchar({ length: 255 }).notNull(),
+  createdAt: timestamp().notNull().defaultNow(),
+  updatedAt: timestamp().notNull().defaultNow(),
+})
\ No newline at end of file
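
A note on configuration: drizzle.config.ts reads DATABASE_URL from the environment via dotenv, but no .env template is committed (the .gitignore above excludes it). Assuming the compose defaults, that is, the stock postgres superuser, the POSTGRES_PASSWORD value, and the scraper database, a scraper/.env along these lines should satisfy both drizzle-kit and the runtime code:

```
DATABASE_URL=postgres://postgres:abfi29239ed98q93aEa89EriiaKaye896quAirhaAu@localhost:5432/scraper
```

With the container up (docker compose up -d), npx drizzle-kit push applies the scrape_queue and search_data tables from src/db/schema.ts directly to the database, without generating migration files.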
diff --git a/scraper/src/index.ts b/scraper/src/index.ts
new file mode 100644
index 0000000..1dda13a
--- /dev/null
+++ b/scraper/src/index.ts
@@ -0,0 +1,60 @@
+import { checkIngest } from "./ingest/ingest"
+import { clearScreen, truncate } from "./util/osFunctions"
+import getNextPendingUrl from "./util/toScrape"
+import * as readline from "readline"
+
+const rl = readline.createInterface({
+  input: process.stdin,
+  output: process.stdout
+})
+
+function promptUser(question: string): Promise<string> {
+  return new Promise((resolve) => {
+    rl.question(question, (answer) => {
+      resolve(answer)
+    })
+  })
+}
+
+async function main() {
+  await checkIngest()
+  console.log()
+
+  while (true) {
+    const url = await getNextPendingUrl()
+    if (!url) {
+      console.log("No URLs to scrape")
+      rl.close()
+      process.exit(0)
+    }
+
+    clearScreen()
+
+    console.log("┌───────────────────────────────────────────────┐")
+    console.log("│               NAVIGATE SCRAPER                │")
+    console.log("├───────────────────────────────────────────────┤")
+    console.log(`│ URL: ${truncate(url, { length: 37 }).padEnd(40)} │`)
+    console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
+    console.log("┃        [S]crape        ┃        [Q]uit        ┃")
+    console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
+
+    const input = (await promptUser("> ")).trim().toLowerCase()
+    if (input === "s") {
+      console.log("I would scrape now...")
+    } else if (input === "q") {
+      clearScreen()
+      console.log("\nExiting...\n")
+      rl.close()
+      process.exit(0)
+    } else {
+      clearScreen()
+      console.log("Invalid input. Please enter 's' to scrape or 'q' to quit.\n")
+    }
+  }
+}
+
+main().catch(err => {
+  console.error("[!] Error:", err)
+  rl.close()
+  process.exit(1)
+})
\ No newline at end of file
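
The [S]crape branch above is still a stub that only prints "I would scrape now...". Here is a minimal sketch of what that step could look like, built on the axios and cheerio dependencies already declared in package.json and the searchData table from the schema; the scrapeUrl name and the "done"/"failed" status strings are assumptions, not anything this patch defines:

```ts
import axios from "axios"
import * as cheerio from "cheerio"
import { eq } from "drizzle-orm"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue, searchData } from "../db/schema"

// Hypothetical scrape step: fetch the page, extract <title> and the meta
// description, store a search_data row, and advance the queue entry.
export async function scrapeUrl(url: string) {
  const db = drizzle(process.env.DATABASE_URL!)
  try {
    const { data } = await axios.get<string>(url, { timeout: 10_000 })
    const $ = cheerio.load(data)
    const title = $("title").first().text().trim()
    const description = $('meta[name="description"]').attr("content")?.trim() ?? ""

    await db.insert(searchData).values({
      url,
      title: title.slice(0, 255), // both columns are varchar(255)
      description: description.slice(0, 255),
    })
    await db
      .update(scrapeQueue)
      .set({ status: "done", updatedAt: new Date() })
      .where(eq(scrapeQueue.url, url))
  } catch (error) {
    await db
      .update(scrapeQueue)
      .set({ status: "failed", updatedAt: new Date() })
      .where(eq(scrapeQueue.url, url))
    throw error
  }
}
```

Whatever shape the real implementation takes, it has to move the row out of "pending": getNextPendingUrl() always returns the oldest pending row, so a status left unchanged would hand the menu the same URL forever.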
diff --git a/scraper/src/ingest/ingest.ts b/scraper/src/ingest/ingest.ts
new file mode 100644
index 0000000..33a5540
--- /dev/null
+++ b/scraper/src/ingest/ingest.ts
@@ -0,0 +1,66 @@
+import fs from "fs"
+import path from "path"
+import { fileURLToPath } from "url"
+import { drizzle } from "drizzle-orm/node-postgres"
+import { scrapeQueue } from "../db/schema"
+import type { NodePgDatabase } from "drizzle-orm/node-postgres"
+import * as schema from "../db/schema"
+
+export function checkIngestCount() {
+  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
+  const urlArray = urls.split("\n")
+
+  // Cleanup
+  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
+  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)
+
+  return duplicatesRemoved.length
+}
+
+export async function ingestUrls() {
+  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
+  let urlArray = urls.split("\n")
+
+  // Cleanup
+  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
+  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)
+
+  // Ingest
+  const db = drizzle(process.env.DATABASE_URL!) as NodePgDatabase<typeof schema>
+
+  let successCt = 0
+  let failCt = 0
+
+  for (const url of duplicatesRemoved) {
+    try {
+      await db.insert(scrapeQueue).values({
+        url,
+        status: "pending",
+      })
+
+      urlArray = urlArray.filter((u) => u !== url)
+
+      successCt++
+    } catch (error) {
+      console.error(`Failed to ingest: ${url} | ${error}`)
+      failCt++
+    }
+  }
+
+  fs.writeFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), urlArray.join("\n"))
+
+  return {
+    success: successCt,
+    failure: failCt,
+  }
+}
+
+export async function checkIngest() {
+  if (checkIngestCount() === 0) {
+    console.log("[i] No URLs to ingest")
+  } else {
+    console.log(`[i] Ingesting ${checkIngestCount()} URLs`)
+    const { success, failure } = await ingestUrls()
+    console.log(`[✓] Ingested ${success} URLs, failed to ingest ${failure} URLs`)
+  }
+}
\ No newline at end of file
diff --git a/scraper/src/util/osFunctions.ts b/scraper/src/util/osFunctions.ts
new file mode 100644
index 0000000..02f0539
--- /dev/null
+++ b/scraper/src/util/osFunctions.ts
@@ -0,0 +1,16 @@
+import { execSync } from "child_process"
+
+export function clearScreen() {
+  const os = process.platform
+
+  if (os === "win32") {
+    execSync("cls", { stdio: "inherit" })
+  } else {
+    execSync("clear", { stdio: "inherit" })
+  }
+}
+
+export function truncate(str: string, { length }: { length: number }): string {
+  if (str.length <= length) return str;
+  return str.slice(0, length) + "...";
+}
\ No newline at end of file
diff --git a/scraper/src/util/toScrape.ts b/scraper/src/util/toScrape.ts
new file mode 100644
index 0000000..e99c109
--- /dev/null
+++ b/scraper/src/util/toScrape.ts
@@ -0,0 +1,28 @@
+import "dotenv/config"
+import { drizzle } from "drizzle-orm/node-postgres"
+import { scrapeQueue } from "../db/schema"
+import { eq, asc } from "drizzle-orm"
+import type { NodePgDatabase } from "drizzle-orm/node-postgres"
+import * as schema from "../db/schema"
+import { Pool } from "pg"
+
+const pool = new Pool({
+  connectionString: process.env.DATABASE_URL,
+})
+
+const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>
+
+async function getNextPendingUrl() {
+  const url = await db.query.scrapeQueue.findFirst({
+    where: eq(scrapeQueue.status, "pending"),
+    orderBy: [asc(scrapeQueue.createdAt)],
+  })
+
+  if (!url) {
+    return null
+  }
+
+  return url.url
+}
+
+export default getNextPendingUrl
\ No newline at end of file
diff --git a/scraper/tsconfig.json b/scraper/tsconfig.json
new file mode 100644
index 0000000..9c62f74
--- /dev/null
+++ b/scraper/tsconfig.json
@@ -0,0 +1,28 @@
+{
+  "compilerOptions": {
+    // Environment setup & latest features
+    "lib": ["ESNext"],
+    "target": "ESNext",
+    "module": "ESNext",
+    "moduleDetection": "force",
+    "jsx": "react-jsx",
+    "allowJs": true,
+
+    // Bundler mode
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "verbatimModuleSyntax": true,
+    "noEmit": true,
+
+    // Best practices
+    "strict": true,
+    "skipLibCheck": true,
+    "noFallthroughCasesInSwitch": true,
+    "noUncheckedIndexedAccess": true,
+
+    // Some stricter flags (disabled by default)
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "noPropertyAccessFromIndexSignature": false
+  }
+}
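
Finally, the ingest pipeline expects a seed file at src/ingest/urls.txt (deliberately gitignored), one URL per line; blank lines and duplicates are filtered out before insertion, and successfully ingested lines are removed from the file afterwards. A sample seed file, with purely illustrative URLs:

```
https://example.com/
https://example.org/about
```

From a fresh checkout, the run order would be: docker compose up -d, npx drizzle-kit push, place a urls.txt in src/ingest/, then npm start (which runs tsx src/index.ts).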