feat: add extraction+saving of title, description, image, and keywords to db, add robots check before scraping

This commit is contained in:
Aidan 2025-04-23 21:45:37 -04:00
parent 47d8470372
commit b058ca23c0
5 changed files with 82 additions and 3 deletions

View File

@ -20,6 +20,7 @@
"cheerio": "^1.0.0",
"dotenv": "^16.5.0",
"drizzle-orm": "^0.42.0",
"pg": "^8.15.5"
"pg": "^8.15.5",
"robots-txt-parser": "^2.0.3"
}
}

View File

@ -1,4 +1,4 @@
import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"
import { integer, pgTable, varchar, timestamp, text } from "drizzle-orm/pg-core"
export const scrapeQueue = pgTable("scrape_queue", {
id: integer().primaryKey().generatedAlwaysAsIdentity(),
@ -13,6 +13,8 @@ export const searchData = pgTable("search_data", {
url: varchar({ length: 255 }).notNull(),
title: varchar({ length: 255 }).notNull(),
description: varchar({ length: 255 }).notNull(),
imageUrl: varchar({ length: 255 }).notNull(),
keywords: text("text[]").array(),
createdAt: timestamp().notNull().defaultNow(),
updatedAt: timestamp().notNull().defaultNow(),
})

View File

@ -1,5 +1,7 @@
import { checkIngest } from "./ingest/ingest"
import checkIfScrapeAllowed from "./safety/preCheck"
import { clearScreen, truncate } from "./util/osFunctions"
import { saveToDatabase, scrapeUrl } from "./util/scrape"
import getRandomUrl from "./util/toScrape"
import * as readline from "readline"
@ -16,6 +18,7 @@ function promptUser(question: string): Promise<string> {
})
}
clearScreen()
checkIngest()
console.log()
@ -34,13 +37,25 @@ async function main() {
console.log("│ NAVIGATE SCRAPER │")
console.log("├───────────────────────────────────────────────┤")
console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
console.log("│ Pre-check: ", await checkIfScrapeAllowed(url) ? "Allowed" : "Blocked", " │")
console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
console.log("┃ [S]crape ┃ [Q]uit ┃")
console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
const input = await promptUser("> ")
if (input === "s") {
console.log("I would scrape now...")
const { title, description, imageUrl, keywordsArray } = await scrapeUrl(url)
console.log(`\nTitle: ${title}`)
console.log(`Description: ${description}`)
console.log(`Image URL: ${imageUrl}`)
console.log(`Keywords: ${keywordsArray.join(", ")}\n`)
console.log("Save to database? (y/n)")
const save = await promptUser("> ")
if (save === "y") {
await saveToDatabase(url, title, description, imageUrl, keywordsArray)
}
} else if (input === "q") {
clearScreen()
console.log("\nExiting...\n")

View File

@ -0,0 +1,18 @@
import robotsParser from 'robots-txt-parser';
// Shared robots.txt parser instance for the whole process.
// allowOnNeutral: false — when allow/disallow rules are equally specific
// (neutral), treat the URL as blocked rather than crawlable (fail closed).
const robots = robotsParser(
  {
    userAgent: 'NavigateBot',
    allowOnNeutral: false,
  },
)
/**
 * Checks the target site's robots.txt and reports whether NavigateBot is
 * allowed to crawl the given URL.
 *
 * @param url - Absolute URL to test against the site's robots.txt.
 * @returns true when crawling is permitted; false when it is disallowed or
 *          when robots.txt could not be fetched/parsed (fail closed).
 */
export default async function checkIfScrapeAllowed(url: string): Promise<boolean> {
  try {
    await robots.useRobotsFor(url)
    // `return await` (not a bare `return`) so that a rejection from
    // canCrawl() is caught by this try/catch and we fail closed; returning
    // the un-awaited promise would let the rejection escape the handler.
    return await robots.canCrawl(url)
  } catch (error) {
    console.error("[!]", error)
    return false
  }
}

View File

@ -0,0 +1,43 @@
import axios from "axios"
import "dotenv/config"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import { drizzle } from "drizzle-orm/node-postgres"
import { searchData } from "../db/schema"
import * as cheerio from "cheerio"
import * as schema from "../db/schema"
import { Pool } from "pg"
// Postgres connection pool; DATABASE_URL comes from the environment
// (loaded by the "dotenv/config" import above).
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
})

// Drizzle ORM handle typed against the full schema module so table
// definitions (e.g. searchData) are available with full typing.
const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>
/**
 * Downloads a page and extracts basic metadata for indexing.
 *
 * @param url - Page URL to fetch and parse.
 * @returns title (whitespace-trimmed), meta description, og:image URL
 *          (empty string when absent), and meta keywords split into an
 *          array of trimmed, non-empty strings.
 * @throws Propagates axios errors (network failure, non-2xx response).
 */
export async function scrapeUrl(url: string) {
  const response = await axios.get(url)
  const $ = cheerio.load(response.data)

  // Data — <title> text often contains newlines/indentation, so trim it.
  const title = $("title").text().trim()
  const description = $("meta[name='description']").attr("content") || ""
  const imageUrl = $("meta[property='og:image']").attr("content") || ""
  const keywords = $("meta[name='keywords']").attr("content")

  // Extract keywords: "a, b," would naively split into ["a", " b", ""],
  // so trim each entry and drop empties from trailing/double commas.
  let keywordsArray: string[] = []
  if (keywords) {
    keywordsArray = keywords
      .split(",")
      .map((k) => k.trim())
      .filter((k) => k.length > 0)
  }

  return { title, description, imageUrl, keywordsArray }
}
/**
 * Persists one scraped page's metadata into the search_data table.
 * createdAt/updatedAt are omitted here and fall back to the schema's
 * defaultNow() values.
 *
 * @param url - Page URL the metadata was scraped from.
 * @param title - Page title.
 * @param description - Meta description.
 * @param imageUrl - og:image URL (may be an empty string).
 * @param keywordsArray - Parsed meta keywords.
 */
export async function saveToDatabase(url: string, title: string, description: string, imageUrl: string, keywordsArray: string[]) {
  const row = {
    url,
    title,
    description,
    imageUrl,
    keywords: keywordsArray,
  }
  await db.insert(searchData).values(row)
}