diff --git a/scraper/package.json b/scraper/package.json
index 7cd33b1..ffe787e 100644
--- a/scraper/package.json
+++ b/scraper/package.json
@@ -20,6 +20,7 @@
     "cheerio": "^1.0.0",
     "dotenv": "^16.5.0",
     "drizzle-orm": "^0.42.0",
-    "pg": "^8.15.5"
+    "pg": "^8.15.5",
+    "robots-txt-parser": "^2.0.3"
   }
 }
diff --git a/scraper/src/db/schema.ts b/scraper/src/db/schema.ts
index 6327eab..1c20249 100644
--- a/scraper/src/db/schema.ts
+++ b/scraper/src/db/schema.ts
@@ -1,4 +1,4 @@
-import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"
+import { integer, pgTable, varchar, timestamp, text } from "drizzle-orm/pg-core"
 
 export const scrapeQueue = pgTable("scrape_queue", {
   id: integer().primaryKey().generatedAlwaysAsIdentity(),
@@ -13,6 +13,8 @@ export const searchData = pgTable("search_data", {
   url: varchar({ length: 255 }).notNull(),
   title: varchar({ length: 255 }).notNull(),
   description: varchar({ length: 255 }).notNull(),
+  imageUrl: varchar({ length: 255 }).notNull(),
+  keywords: text().array(),
   createdAt: timestamp().notNull().defaultNow(),
   updatedAt: timestamp().notNull().defaultNow(),
 })
\ No newline at end of file
diff --git a/scraper/src/index.ts b/scraper/src/index.ts
index 1dda13a..6769ec1 100644
--- a/scraper/src/index.ts
+++ b/scraper/src/index.ts
@@ -1,5 +1,7 @@
 import { checkIngest } from "./ingest/ingest"
+import checkIfScrapeAllowed from "./safety/preCheck"
 import { clearScreen, truncate } from "./util/osFunctions"
+import { saveToDatabase, scrapeUrl } from "./util/scrape"
 import getRandomUrl from "./util/toScrape"
 import * as readline from "readline"
 
@@ -16,6 +18,7 @@ function promptUser(question: string): Promise<string> {
   })
 }
 
+clearScreen()
 checkIngest()
 console.log()
 
@@ -34,13 +37,25 @@ async function main() {
   console.log("│               NAVIGATE SCRAPER                │")
   console.log("├───────────────────────────────────────────────┤")
   console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
+  console.log("│ Pre-check: ", await checkIfScrapeAllowed(url) ? "Allowed" : "Blocked", " │")
   console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
   console.log("┃ [S]crape               ┃ [Q]uit               ┃")
   console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
 
   const input = await promptUser("> ")
   if (input === "s") {
-    console.log("I would scrape now...")
+    const { title, description, imageUrl, keywordsArray } = await scrapeUrl(url)
+
+    console.log(`\nTitle: ${title}`)
+    console.log(`Description: ${description}`)
+    console.log(`Image URL: ${imageUrl}`)
+    console.log(`Keywords: ${keywordsArray.join(", ")}\n`)
+
+    console.log("Save to database? (y/n)")
(y/n)") + const save = await promptUser("> ") + if (save === "y") { + await saveToDatabase(url, title, description, imageUrl, keywordsArray) + } } else if (input === "q") { clearScreen() console.log("\nExiting...\n") diff --git a/scraper/src/safety/preCheck.ts b/scraper/src/safety/preCheck.ts new file mode 100644 index 0000000..a5879af --- /dev/null +++ b/scraper/src/safety/preCheck.ts @@ -0,0 +1,18 @@ +import robotsParser from 'robots-txt-parser'; + +const robots = robotsParser( + { + userAgent: 'NavigateBot', + allowOnNeutral: false, + }, +) + +export default async function checkIfScrapeAllowed(url: string) { + try { + await robots.useRobotsFor(url) + return robots.canCrawl(url) + } catch (error) { + console.error("[!]", error) + return false + } +} \ No newline at end of file diff --git a/scraper/src/util/scrape.ts b/scraper/src/util/scrape.ts new file mode 100644 index 0000000..f2ea187 --- /dev/null +++ b/scraper/src/util/scrape.ts @@ -0,0 +1,43 @@ +import axios from "axios" +import "dotenv/config" +import { NodePgDatabase } from "drizzle-orm/node-postgres" +import { drizzle } from "drizzle-orm/node-postgres" +import { searchData } from "../db/schema" +import * as cheerio from "cheerio" +import * as schema from "../db/schema" +import { Pool } from "pg" + +const pool = new Pool({ + connectionString: process.env.DATABASE_URL, +}) + +const db = drizzle(pool, { schema }) as NodePgDatabase + +export async function scrapeUrl(url: string) { + const response = await axios.get(url) + const $ = cheerio.load(response.data) + + // Data + const title = $("title").text() + const description = $("meta[name='description']").attr("content") || "" + const imageUrl = $("meta[property='og:image']").attr("content") || "" + const keywords = $("meta[name='keywords']").attr("content") + + // Extract keywords + let keywordsArray: string[] = [] + if (keywords) { + keywordsArray = keywords.split(",") + } + + return { title, description, imageUrl, keywordsArray } +} + +export async function saveToDatabase(url: string, title: string, description: string, imageUrl: string, keywordsArray: string[]) { + await db.insert(searchData).values({ + url, + title, + description, + imageUrl, + keywords: keywordsArray, + }) +} \ No newline at end of file