[scraper] func: add database, docker support, ingest functions, and basic menu
Commit 47d8470372

24 UNLICENSE Normal file
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org/>

43 scraper/.gitignore vendored Normal file
@@ -0,0 +1,43 @@
# dependencies (bun install)
node_modules

# output
out
dist
*.tgz

# code coverage
coverage
*.lcov

# logs
logs
_.log
report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# caches
.eslintcache
.cache
*.tsbuildinfo

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store

# ingest files
src/ingest/urls.txt

# postgres
postgres

# bun
bun.lock*

10 scraper/docker-compose.yml Normal file
@@ -0,0 +1,10 @@
services:
  postgres:
    image: postgres:17
    environment:
      POSTGRES_PASSWORD: abfi29239ed98q93aEa89EriiaKaye896quAirhaAu
      POSTGRES_DB: scraper
    ports:
      - 5432:5432
    volumes:
      - ./postgres:/var/lib/postgresql/data

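drizzle.config.ts and the scraper code read their connection string from DATABASE_URL, and .env is gitignored, so no env file ships with this commit. Assuming the default postgres superuser and the values above, a matching entry would presumably look like:

DATABASE_URL=postgres://postgres:abfi29239ed98q93aEa89EriiaKaye896quAirhaAu@localhost:5432/scraper

with the database brought up beforehand via docker compose up -d.
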
11 scraper/drizzle.config.ts Normal file
@@ -0,0 +1,11 @@
import "dotenv/config"
import { defineConfig } from "drizzle-kit"

export default defineConfig({
  out: "./drizzle",
  schema: "./src/db/schema.ts",
  dialect: "postgresql",
  dbCredentials: {
    url: process.env.DATABASE_URL!,
  },
})

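Nothing in this commit actually creates the scrape_queue and search_data tables. With this config in place, drizzle-kit (already a devDependency) can presumably be used to apply src/db/schema.ts to the running container, for example:

npx drizzle-kit push

or, for versioned SQL migrations written to ./drizzle, npx drizzle-kit generate followed by npx drizzle-kit migrate.
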
25 scraper/package.json Normal file
@@ -0,0 +1,25 @@
{
  "name": "navigate",
  "module": "src/index.ts",
  "type": "module",
  "private": true,
  "scripts": {
    "start": "tsx src/index.ts"
  },
  "devDependencies": {
    "@types/bun": "latest",
    "@types/pg": "^8.11.13",
    "drizzle-kit": "^0.31.0",
    "tsx": "^4.19.3"
  },
  "peerDependencies": {
    "typescript": "^5"
  },
  "dependencies": {
    "axios": "^1.8.4",
    "cheerio": "^1.0.0",
    "dotenv": "^16.5.0",
    "drizzle-orm": "^0.42.0",
    "pg": "^8.15.5"
  }
}

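Assuming the Postgres container is up and DATABASE_URL is set, the interactive menu would presumably be started with the start script:

npm install
npm run start

The @types/bun devDependency and the bun.lock* entry in .gitignore suggest bun install / bun run start is intended to work as well, even though the start script itself invokes tsx.
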
18 scraper/src/db/schema.ts Normal file
@@ -0,0 +1,18 @@
import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"

export const scrapeQueue = pgTable("scrape_queue", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  status: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})

export const searchData = pgTable("search_data", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  title: varchar({ length: 255 }).notNull(),
  description: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})

60 scraper/src/index.ts Normal file
@@ -0,0 +1,60 @@
import { checkIngest } from "./ingest/ingest"
import { clearScreen, truncate } from "./util/osFunctions"
import getRandomUrl from "./util/toScrape"
import * as readline from "readline"

const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout
})

function promptUser(question: string): Promise<string> {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer)
    })
  })
}

await checkIngest()
console.log()

async function main() {
  while (true) {
    const url = await getRandomUrl()
    if (!url) {
      console.log("No URLs to scrape")
      rl.close()
      process.exit(0)
    }

    clearScreen()

    console.log("┌───────────────────────────────────────────────┐")
    console.log("│                NAVIGATE SCRAPER               │")
    console.log("├───────────────────────────────────────────────┤")
    console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
    console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
    console.log("┃        [S]crape        ┃        [Q]uit        ┃")
    console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")

    const input = await promptUser("> ")
    if (input === "s") {
      console.log("I would scrape now...")
    } else if (input === "q") {
      clearScreen()
      console.log("\nExiting...\n")
      rl.close()
      process.exit(0)
    } else {
      clearScreen()
      console.log("Invalid input. Please enter 's' to scrape or 'q' to quit.\n")
    }
  }
}

main().catch(err => {
  console.error("[!] Error:", err)
  rl.close()
  process.exit(1)
})

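The [S]crape branch is still a stub ("I would scrape now..."), but axios and cheerio are already declared as dependencies, so the eventual scrape step presumably looks roughly like the sketch below. Everything here is an assumption on top of this commit: the scrapeUrl helper does not exist yet, the paths assume the sketch sits next to index.ts in src/, the db instance mirrors the one in toScrape.ts, the queue row is matched by URL because getRandomUrl only returns the string, and title/description are clipped to fit the 255-character varchar columns.

import "dotenv/config"
import axios from "axios"
import * as cheerio from "cheerio"
import { eq } from "drizzle-orm"
import { drizzle } from "drizzle-orm/node-postgres"
import { Pool } from "pg"
import { scrapeQueue, searchData } from "./db/schema"
import * as schema from "./db/schema"

// Same connection setup as toScrape.ts
const pool = new Pool({ connectionString: process.env.DATABASE_URL })
const db = drizzle(pool, { schema })

// Hypothetical helper: fetch the page, store its metadata, and mark the queue row as done.
async function scrapeUrl(url: string) {
  const { data: html } = await axios.get<string>(url)
  const $ = cheerio.load(html)

  // Clip to the varchar(255) columns defined in src/db/schema.ts
  const title = $("title").text().trim().slice(0, 255)
  const description = ($('meta[name="description"]').attr("content") ?? "").trim().slice(0, 255)

  await db.insert(searchData).values({ url, title, description })
  await db
    .update(scrapeQueue)
    .set({ status: "done", updatedAt: new Date() })
    .where(eq(scrapeQueue.url, url))
}

Inside main(), the "s" branch would then call await scrapeUrl(url) instead of logging the placeholder.
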
66 scraper/src/ingest/ingest.ts Normal file
@@ -0,0 +1,66 @@
import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"

export function checkIngestCount() {
  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
  const urlArray = urls.split("\n")

  // Cleanup
  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)

  return duplicatesRemoved.length
}

export async function ingestUrls() {
  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
  let urlArray = urls.split("\n")

  // Cleanup
  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)

  // Ingest
  const db = drizzle(process.env.DATABASE_URL!) as NodePgDatabase<typeof schema>

  let successCt = 0
  let failCt = 0

  for (const url of duplicatesRemoved) {
    try {
      await db.insert(scrapeQueue).values({
        url,
        status: "pending",
      })

      urlArray = urlArray.filter((u) => u !== url)

      successCt++
    } catch (error) {
      console.error(`Failed to ingest: ${url} | ${error}`)
      failCt++
    }
  }

  fs.writeFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), urlArray.join("\n"))

  return {
    success: successCt,
    failure: failCt,
  }
}

export async function checkIngest() {
  if (checkIngestCount() === 0) {
    console.log("[i] No URLs to ingest")
  } else {
    console.log(`[i] Ingesting ${checkIngestCount()} URLs`)
    const { success, failure } = await ingestUrls()
    console.log(`[✓] Ingested ${success} URLs, failed to ingest ${failure} URLs`)
  }
}

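checkIngest() reads src/ingest/urls.txt (gitignored above), drops blank lines and duplicates, inserts each remaining URL into scrape_queue with status "pending", and removes the successfully inserted URLs from the file. The expected input is presumably just one URL per line, e.g.:

https://example.com/
https://example.org/some/page
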
16 scraper/src/util/osFunctions.ts Normal file
@@ -0,0 +1,16 @@
import { execSync } from "child_process"

export function clearScreen() {
  const os = process.platform

  if (os === "win32") {
    execSync("cls", { stdio: "inherit" })
  } else {
    execSync("clear", { stdio: "inherit" })
  }
}

export function truncate(str: string, { length }: { length: number }): string {
  if (str.length <= length) return str;
  return str.slice(0, length) + "...";
}

28 scraper/src/util/toScrape.ts Normal file
@@ -0,0 +1,28 @@
import "dotenv/config"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { eq, asc } from "drizzle-orm"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"
import { Pool } from "pg"

const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
})

const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>

async function getRandomUrl() {
  const url = await db.query.scrapeQueue.findFirst({
    where: eq(scrapeQueue.status, "pending"),
    orderBy: [asc(scrapeQueue.createdAt)],
  })

  if (!url) {
    return null
  }

  return url.url
}

export default getRandomUrl

28 scraper/tsconfig.json Normal file
@@ -0,0 +1,28 @@
{
  "compilerOptions": {
    // Environment setup & latest features
    "lib": ["ESNext"],
    "target": "ESNext",
    "module": "ESNext",
    "moduleDetection": "force",
    "jsx": "react-jsx",
    "allowJs": true,

    // Bundler mode
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "verbatimModuleSyntax": true,
    "noEmit": true,

    // Best practices
    "strict": true,
    "skipLibCheck": true,
    "noFallthroughCasesInSwitch": true,
    "noUncheckedIndexedAccess": true,

    // Some stricter flags (disabled by default)
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noPropertyAccessFromIndexSignature": false
  }
}