diff --git a/packages/components/src/utils.ts b/packages/components/src/utils.ts
index 548632ea2..e7977809f 100644
--- a/packages/components/src/utils.ts
+++ b/packages/components/src/utils.ts
@@ -290,22 +290,12 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
     const linkElements = dom.window.document.querySelectorAll('a')
     const urls: string[] = []
     for (const linkElement of linkElements) {
-        if (linkElement.href.slice(0, 1) === '/') {
-            try {
-                const urlObj = new URL(baseURL + linkElement.href)
-                urls.push(urlObj.href) //relative
-            } catch (err) {
-                if (process.env.DEBUG === 'true') console.error(`error with relative url: ${err.message}`)
-                continue
-            }
-        } else {
-            try {
-                const urlObj = new URL(linkElement.href)
-                urls.push(urlObj.href) //absolute
-            } catch (err) {
-                if (process.env.DEBUG === 'true') console.error(`error with absolute url: ${err.message}`)
-                continue
-            }
+        try {
+            const urlObj = new URL(linkElement.href, baseURL)
+            urls.push(urlObj.href)
+        } catch (err) {
+            if (process.env.DEBUG === 'true') console.error(`error with scraped URL: ${err.message}`)
+            continue
         }
     }
     return urls
@@ -365,7 +355,7 @@ async function crawl(baseURL: string, currentURL: string, pages: string[], limit
     }
 
     const htmlBody = await resp.text()
-    const nextURLs = getURLsFromHTML(htmlBody, baseURL)
+    const nextURLs = getURLsFromHTML(htmlBody, currentURL)
     for (const nextURL of nextURLs) {
         pages = await crawl(baseURL, nextURL, pages, limit)
     }
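
Reviewer note: the consolidation in the first hunk leans on the WHATWG `URL` constructor's optional base argument, which is only consulted when the first argument is not already an absolute URL; the second hunk passes `currentURL` as that base so page-relative links resolve against the page being crawled rather than the site root. A minimal standalone sketch of that behavior (the example URLs below are made up, not from the codebase):

```typescript
// Standalone sketch (not part of utils.ts) of WHATWG URL base resolution.
const currentURL = 'https://example.com/docs/guide/intro.html'

// Root-relative href: the base contributes only its origin.
console.log(new URL('/pricing', currentURL).href)
// -> https://example.com/pricing

// Page-relative href: resolved against the current page's path,
// which is why crawl() now passes currentURL instead of baseURL.
console.log(new URL('next.html', currentURL).href)
// -> https://example.com/docs/guide/next.html

// Absolute href: the base is ignored entirely.
console.log(new URL('https://other.org/a', currentURL).href)
// -> https://other.org/a

// Malformed href: still throws, so the single try/catch remains necessary.
try {
    new URL('http://', currentURL)
} catch (err) {
    console.error('invalid URL:', (err as Error).message)
}
```

Collapsing the relative/absolute branches into one constructor call also means a single try/catch now covers malformed hrefs of either kind.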