[scraper] func: add database, docker support, ingest functions, and basic menu
Commit 47d8470372

24 UNLICENSE Normal file
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org/>

43 scraper/.gitignore vendored Normal file
@@ -0,0 +1,43 @@
# dependencies (bun install)
node_modules

# output
out
dist
*.tgz

# code coverage
coverage
*.lcov

# logs
logs
_.log
report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# caches
.eslintcache
.cache
*.tsbuildinfo

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store

# ingest files
src/ingest/urls.txt

# postgres
postgres

# bun
bun.lock*

10 scraper/docker-compose.yml Normal file
@@ -0,0 +1,10 @@
services:
  postgres:
    image: postgres:17
    environment:
      POSTGRES_PASSWORD: abfi29239ed98q93aEa89EriiaKaye896quAirhaAu
      POSTGRES_DB: scraper
    ports:
      - 5432:5432
    volumes:
      - ./postgres:/var/lib/postgresql/data

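drizzle.config.ts and the scraper code read their connection string from DATABASE_URL, and .env is gitignored, so no env file ships with this commit. Assuming the default postgres superuser and the values above, a matching entry would presumably look like:

DATABASE_URL=postgres://postgres:abfi29239ed98q93aEa89EriiaKaye896quAirhaAu@localhost:5432/scraper

with the database brought up beforehand via docker compose up -d.
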
11 scraper/drizzle.config.ts Normal file
@@ -0,0 +1,11 @@
import "dotenv/config"
import { defineConfig } from "drizzle-kit"

export default defineConfig({
  out: "./drizzle",
  schema: "./src/db/schema.ts",
  dialect: "postgresql",
  dbCredentials: {
    url: process.env.DATABASE_URL!,
  },
})

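Nothing in this commit actually creates the scrape_queue and search_data tables. With this config in place, drizzle-kit (already a devDependency) can presumably be used to apply src/db/schema.ts to the running container, for example:

npx drizzle-kit push

or, for versioned SQL migrations written to ./drizzle, npx drizzle-kit generate followed by npx drizzle-kit migrate.
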
25 scraper/package.json Normal file
@@ -0,0 +1,25 @@
{
  "name": "navigate",
  "module": "src/index.ts",
  "type": "module",
  "private": true,
  "scripts": {
    "start": "tsx src/index.ts"
  },
  "devDependencies": {
    "@types/bun": "latest",
    "@types/pg": "^8.11.13",
    "drizzle-kit": "^0.31.0",
    "tsx": "^4.19.3"
  },
  "peerDependencies": {
    "typescript": "^5"
  },
  "dependencies": {
    "axios": "^1.8.4",
    "cheerio": "^1.0.0",
    "dotenv": "^16.5.0",
    "drizzle-orm": "^0.42.0",
    "pg": "^8.15.5"
  }
}

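Assuming the Postgres container is up and DATABASE_URL is set, the interactive menu would presumably be started with the start script:

npm install
npm run start

The @types/bun devDependency and the bun.lock* entry in .gitignore suggest bun install / bun run start is intended to work as well, even though the start script itself invokes tsx.
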
18 scraper/src/db/schema.ts Normal file
@@ -0,0 +1,18 @@
import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"

export const scrapeQueue = pgTable("scrape_queue", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  status: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})

export const searchData = pgTable("search_data", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  title: varchar({ length: 255 }).notNull(),
  description: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})

60 scraper/src/index.ts Normal file
@@ -0,0 +1,60 @@
import { checkIngest } from "./ingest/ingest"
import { clearScreen, truncate } from "./util/osFunctions"
import getRandomUrl from "./util/toScrape"
import * as readline from "readline"

const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout
})

function promptUser(question: string): Promise<string> {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer)
    })
  })
}

await checkIngest()
console.log()

async function main() {
  while (true) {
    const url = await getRandomUrl()
    if (!url) {
      console.log("No URLs to scrape")
      rl.close()
      process.exit(0)
    }

    clearScreen()

    console.log("┌───────────────────────────────────────────────┐")
    console.log("│                NAVIGATE SCRAPER               │")
    console.log("├───────────────────────────────────────────────┤")
    console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
    console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
    console.log("┃        [S]crape        ┃        [Q]uit        ┃")
    console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")

    const input = await promptUser("> ")
    if (input === "s") {
      console.log("I would scrape now...")
    } else if (input === "q") {
      clearScreen()
      console.log("\nExiting...\n")
      rl.close()
      process.exit(0)
    } else {
      clearScreen()
      console.log("Invalid input. Please enter 's' to scrape or 'q' to quit.\n")
    }
  }
}

main().catch(err => {
  console.error("[!] Error:", err)
  rl.close()
  process.exit(1)
})

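The [S]crape branch is still a stub ("I would scrape now..."), but axios and cheerio are already declared as dependencies, so the eventual scrape step presumably looks roughly like the sketch below. Everything here is an assumption on top of this commit: the scrapeUrl helper does not exist yet, the paths assume the sketch sits next to index.ts in src/, the db instance mirrors the one in toScrape.ts, the queue row is matched by URL because getRandomUrl only returns the string, and title/description are clipped to fit the 255-character varchar columns.

import "dotenv/config"
import axios from "axios"
import * as cheerio from "cheerio"
import { eq } from "drizzle-orm"
import { drizzle } from "drizzle-orm/node-postgres"
import { Pool } from "pg"
import { scrapeQueue, searchData } from "./db/schema"
import * as schema from "./db/schema"

// Same connection setup as toScrape.ts
const pool = new Pool({ connectionString: process.env.DATABASE_URL })
const db = drizzle(pool, { schema })

// Hypothetical helper: fetch the page, store its metadata, and mark the queue row as done.
async function scrapeUrl(url: string) {
  const { data: html } = await axios.get<string>(url)
  const $ = cheerio.load(html)

  // Clip to the varchar(255) columns defined in src/db/schema.ts
  const title = $("title").text().trim().slice(0, 255)
  const description = ($('meta[name="description"]').attr("content") ?? "").trim().slice(0, 255)

  await db.insert(searchData).values({ url, title, description })
  await db
    .update(scrapeQueue)
    .set({ status: "done", updatedAt: new Date() })
    .where(eq(scrapeQueue.url, url))
}

Inside main(), the "s" branch would then call await scrapeUrl(url) instead of logging the placeholder.
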
66 scraper/src/ingest/ingest.ts Normal file
@@ -0,0 +1,66 @@
import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"

export function checkIngestCount() {
  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
  const urlArray = urls.split("\n")

  // Cleanup
  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)

  return duplicatesRemoved.length
}

export async function ingestUrls() {
  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
  let urlArray = urls.split("\n")

  // Cleanup
  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)

  // Ingest
  const db = drizzle(process.env.DATABASE_URL!) as NodePgDatabase<typeof schema>

  let successCt = 0
  let failCt = 0

  for (const url of duplicatesRemoved) {
    try {
      await db.insert(scrapeQueue).values({
        url,
        status: "pending",
      })

      urlArray = urlArray.filter((u) => u !== url)

      successCt++
    } catch (error) {
      console.error(`Failed to ingest: ${url} | ${error}`)
      failCt++
    }
  }

  fs.writeFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), urlArray.join("\n"))

  return {
    success: successCt,
    failure: failCt,
  }
}

export async function checkIngest() {
  if (checkIngestCount() === 0) {
    console.log("[i] No URLs to ingest")
  } else {
    console.log(`[i] Ingesting ${checkIngestCount()} URLs`)
    const { success, failure } = await ingestUrls()
    console.log(`[✓] Ingested ${success} URLs, failed to ingest ${failure} URLs`)
  }
}

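checkIngest() reads src/ingest/urls.txt (gitignored above), drops blank lines and duplicates, inserts each remaining URL into scrape_queue with status "pending", and removes the successfully inserted URLs from the file. The expected input is presumably just one URL per line, e.g.:

https://example.com/
https://example.org/some/page
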
16 scraper/src/util/osFunctions.ts Normal file
@@ -0,0 +1,16 @@
import { execSync } from "child_process"

export function clearScreen() {
  const os = process.platform

  if (os === "win32") {
    execSync("cls", { stdio: "inherit" })
  } else {
    execSync("clear", { stdio: "inherit" })
  }
}

export function truncate(str: string, { length }: { length: number }): string {
  if (str.length <= length) return str;
  return str.slice(0, length) + "...";
}

28 scraper/src/util/toScrape.ts Normal file
@@ -0,0 +1,28 @@
import "dotenv/config"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { eq, asc } from "drizzle-orm"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"
import { Pool } from "pg"

const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
})

const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>

async function getRandomUrl() {
  const url = await db.query.scrapeQueue.findFirst({
    where: eq(scrapeQueue.status, "pending"),
    orderBy: [asc(scrapeQueue.createdAt)],
  })

  if (!url) {
    return null
  }

  return url.url
}

export default getRandomUrl

28 scraper/tsconfig.json Normal file
@@ -0,0 +1,28 @@
{
  "compilerOptions": {
    // Environment setup & latest features
    "lib": ["ESNext"],
    "target": "ESNext",
    "module": "ESNext",
    "moduleDetection": "force",
    "jsx": "react-jsx",
    "allowJs": true,

    // Bundler mode
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "verbatimModuleSyntax": true,
    "noEmit": true,

    // Best practices
    "strict": true,
    "skipLibCheck": true,
    "noFallthroughCasesInSwitch": true,
    "noUncheckedIndexedAccess": true,

    // Some stricter flags (disabled by default)
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noPropertyAccessFromIndexSignature": false
  }
}