Automated PDF Discovery
Finding papers across databases using APIs and web automation
Understanding Database Patterns
Academic databases fall into three categories, each requiring a different discovery strategy.
API-First Databases
Examples: PubMed, arXiv
Characteristics:
- Provide official APIs for programmatic access
- Structured JSON/XML responses
- Rate limits but clear documentation
- Most reliable and maintainable approach
Best Strategy: Use official APIs with proper authentication and rate limiting
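For reference, rate limiting an official API can be as simple as spacing out requests. The sketch below assumes a budget of roughly 3 requests/second (a common default for unauthenticated access) and uses a hypothetical rateLimitedGet helper; adjust the interval to whatever the target API actually documents.
// rate-limited-get.js (illustrative sketch, not part of any official client)
import axios from 'axios';

// Assumption: roughly 3 requests/second; check the target API's documentation
// and raise or lower the interval accordingly.
const MIN_INTERVAL_MS = 334;
let lastRequestAt = 0;

async function rateLimitedGet(url, params = {}) {
  // Wait until at least MIN_INTERVAL_MS has passed since the previous request.
  const wait = lastRequestAt + MIN_INTERVAL_MS - Date.now();
  if (wait > 0) {
    await new Promise(resolve => setTimeout(resolve, wait));
  }
  lastRequestAt = Date.now();
  return axios.get(url, { params });
}

export default rateLimitedGet;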
No-API Databases with Stable HTML
Examples: JSTOR, IEEE Xplore
Characteristics:
- No official API or severely limited access
- Consistent HTML structure
- Require authentication, but selectors are stable
- Moderate maintenance required
Best Strategy: Playwright automation with CSS/XPath selectors and session management
Bot-Protected Databases
Examples: ScienceDirect, Springer
Characteristics:
- Heavy JavaScript rendering
- Aggressive bot detection
- Changing selectors and CAPTCHAs
- High maintenance overhead
Best Strategy: Playwright with stealth plugins, human-like delays, and rotating user agents
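Stealth tactics are site-specific and fragile, but two of the basic ingredients (rotating user agents and randomized human-like delays) can be sketched with plain Playwright as shown below. The user-agent pool and delay range here are illustrative assumptions, not recommended values.
// stealth-context.js (illustrative sketch)
import { chromium } from 'playwright';

// Assumption: a small pool of realistic desktop user agents, rotated per browser context.
const USER_AGENTS = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
];

// Randomized pause so actions are not perfectly evenly spaced.
const randomDelay = (minMs, maxMs) =>
  new Promise(resolve => setTimeout(resolve, minMs + Math.random() * (maxMs - minMs)));

async function newRotatedContext() {
  const browser = await chromium.launch({ headless: true });
  const context = await browser.newContext({
    userAgent: USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)],
    viewport: { width: 1920, height: 1080 }
  });
  return { browser, context };
}

export { newRotatedContext, randomDelay };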
API-Based Discovery
PubMed API Integration
PubMed provides the most researcher-friendly API for biomedical literature discovery.
Setup NCBI API Access
Register for an NCBI account to obtain an API key for higher rate limits (10 requests/second vs 3 requests/second).
Visit: https://www.ncbi.nlm.nih.gov/account/
Store your API key in environment variables:
export NCBI_API_KEY="your-api-key-here"
Install Dependencies
npm install axios fast-xml-parser
Implement PubMed Discovery Class
Create the discovery client with search, parse, and enrichment capabilities:
// pubmed-discovery.js
import axios from 'axios';
class PubMedDiscovery {
constructor() {
this.baseURL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils';
this.apiKey = process.env.NCBI_API_KEY;
}
async search(query, maxResults = 100, yearFrom = null) {
try {
// Step 1: Search for IDs matching query
const searchParams = {
db: 'pubmed',
term: query,
retmax: maxResults,
retmode: 'json',
api_key: this.apiKey
};
if (yearFrom) {
searchParams.mindate = `${yearFrom}/01/01`;
searchParams.maxdate = `${new Date().getFullYear()}/12/31`;
searchParams.datetype = 'pdat';
}
const searchResponse = await axios.get(`${this.baseURL}/esearch.fcgi`, {
params: searchParams
});
const pmids = searchResponse.data.esearchresult.idlist;
if (pmids.length === 0) {
return { papers: [], total: 0 };
}
// Step 2: Fetch full metadata for each PMID
const detailsResponse = await axios.get(`${this.baseURL}/efetch.fcgi`, {
params: {
db: 'pubmed',
id: pmids.join(','),
retmode: 'xml',
api_key: this.apiKey
}
});
// Step 3: Parse XML to structured data
const papers = await this.parseXML(detailsResponse.data);
// Step 4: Enrich with PDF links from PubMed Central
const enriched = await this.enrichWithPDFLinks(papers);
return {
papers: enriched,
total: pmids.length,
query: query
};
} catch (error) {
console.error('PubMed search failed:', error.message);
throw error;
}
}
async parseXML(xmlData) {
const { XMLParser } = await import('fast-xml-parser');
const parser = new XMLParser({
ignoreAttributes: false,
attributeNamePrefix: '@_'
});
const parsed = parser.parse(xmlData);
const articles = Array.isArray(parsed.PubmedArticleSet?.PubmedArticle)
? parsed.PubmedArticleSet.PubmedArticle
: [parsed.PubmedArticleSet?.PubmedArticle].filter(Boolean);
return articles.map(article => {
const medline = article.MedlineCitation;
const articleData = medline.Article;
return {
pmid: medline.PMID['#text'] || medline.PMID,
title: articleData.ArticleTitle,
abstract: articleData.Abstract?.AbstractText || '',
authors: this.extractAuthors(articleData.AuthorList?.Author || []),
journal: articleData.Journal.Title,
year: articleData.Journal.JournalIssue.PubDate.Year,
doi: this.extractDOI(articleData.ELocationID),
keywords: this.extractKeywords(medline.KeywordList?.Keyword || [])
};
});
}
extractAuthors(authors) {
if (!Array.isArray(authors)) authors = [authors];
return authors.map(author => {
const lastName = author.LastName || '';
const foreName = author.ForeName || author.Initials || '';
return `${lastName}, ${foreName}`.trim();
}).filter(Boolean);
}
extractDOI(eLocationID) {
if (!eLocationID) return null;
const ids = Array.isArray(eLocationID) ? eLocationID : [eLocationID];
const doiEntry = ids.find(id => id['@_EIdType'] === 'doi');
return doiEntry ? doiEntry['#text'] : null;
}
extractKeywords(keywords) {
if (!keywords) return [];
if (!Array.isArray(keywords)) keywords = [keywords];
return keywords.map(k => k['#text'] || k).filter(Boolean);
}
async enrichWithPDFLinks(papers) {
const enriched = [];
for (const paper of papers) {
try {
const pmcResponse = await axios.get(`${this.baseURL}/elink.fcgi`, {
params: {
dbfrom: 'pubmed',
id: paper.pmid,
linkname: 'pubmed_pmc',
retmode: 'json',
api_key: this.apiKey
}
});
const linkset = pmcResponse.data.linksets?.[0];
const pmcid = linkset?.linksetdbs?.[0]?.links?.[0];
if (pmcid) {
paper.pmcid = pmcid;
paper.pdfUrl = `https://www.ncbi.nlm.nih.gov/pmc/articles/PMC${pmcid}/pdf/`;
paper.pdfAvailable = true;
} else {
paper.pdfAvailable = false;
paper.pdfUrl = paper.doi ? `https://doi.org/${paper.doi}` : null;
}
enriched.push(paper);
// Respect API rate limits
await this.delay(100);
} catch (error) {
console.warn(`Failed to enrich PMID ${paper.pmid}:`, error.message);
enriched.push({ ...paper, pdfAvailable: false });
}
}
return enriched;
}
delay(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
export default PubMedDiscovery;
Test PubMed Discovery
import PubMedDiscovery from './pubmed-discovery.js';
const discovery = new PubMedDiscovery();
const results = await discovery.search('machine learning healthcare', 50, 2020);
console.log(`Found ${results.total} papers`);
console.log(`PDF available: ${results.papers.filter(p => p.pdfAvailable).length}`);
PubMed Central (PMC) provides free PDF access for open-access publications. The enrichment step checks PMC availability and constructs direct PDF URLs when available.
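Once enrichment has produced a pdfUrl, downloading is a separate step. Below is a minimal sketch using axios streams; the downloadPDF helper is hypothetical, and it assumes the URL serves the PDF directly, which is not guaranteed for every PMC article.
// download-pdf.js (illustrative sketch)
import axios from 'axios';
import fs from 'fs';

async function downloadPDF(pdfUrl, outputPath) {
  // Stream the response to disk instead of buffering it in memory.
  const response = await axios.get(pdfUrl, { responseType: 'stream' });
  await new Promise((resolve, reject) => {
    const file = fs.createWriteStream(outputPath);
    response.data.pipe(file);
    file.on('finish', resolve);
    file.on('error', reject);
  });
  return outputPath;
}

// Example usage with the enriched results from PubMedDiscovery:
// for (const paper of results.papers.filter(p => p.pdfAvailable)) {
//   await downloadPDF(paper.pdfUrl, `papers/${paper.pmid}.pdf`);
// }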
arXiv API Integration
arXiv offers a clean API for preprint discovery in physics, mathematics, computer science, and related fields.
Install Dependencies
npm install axios fast-xml-parser
Implement arXiv Discovery Class
// arxiv-discovery.js
import axios from 'axios';
class ArXivDiscovery {
constructor() {
this.baseURL = 'http://export.arxiv.org/api/query';
}
async search(query, maxResults = 100, category = null) {
const searchQuery = category
? `cat:${category} AND all:${query}`
: `all:${query}`;
try {
const response = await axios.get(this.baseURL, {
params: {
search_query: searchQuery,
start: 0,
max_results: maxResults,
sortBy: 'submittedDate',
sortOrder: 'descending'
}
});
const papers = await this.parseAtomFeed(response.data);
return { papers, total: papers.length };
} catch (error) {
console.error('arXiv search failed:', error.message);
throw error;
}
}
async parseAtomFeed(atomXML) {
const { XMLParser } = await import('fast-xml-parser');
const parser = new XMLParser({
ignoreAttributes: false,
attributeNamePrefix: '@_'
});
const parsed = parser.parse(atomXML);
const entries = Array.isArray(parsed.feed.entry)
? parsed.feed.entry
: [parsed.feed.entry].filter(Boolean);
return entries.map(entry => ({
arxivId: this.extractArXivId(entry.id),
title: entry.title.replace(/\s+/g, ' ').trim(),
abstract: entry.summary.replace(/\s+/g, ' ').trim(),
authors: this.extractAuthors(entry.author),
published: entry.published,
updated: entry.updated,
categories: this.extractCategories(entry.category),
pdfUrl: this.extractPDFLink(entry.link),
pdfAvailable: true,
// fast-xml-parser returns text-only nodes as plain strings, so handle both shapes
doi: typeof entry['arxiv:doi'] === 'string' ? entry['arxiv:doi'] : entry['arxiv:doi']?.['#text'] || null
}));
}
extractArXivId(id) {
return id.split('/abs/')[1];
}
extractAuthors(authors) {
if (!authors) return [];
if (!Array.isArray(authors)) authors = [authors];
return authors.map(a => a.name);
}
extractCategories(categories) {
if (!categories) return [];
if (!Array.isArray(categories)) categories = [categories];
return categories.map(c => c['@_term']);
}
extractPDFLink(links) {
if (!links) return null;
if (!Array.isArray(links)) links = [links];
const pdfLink = links.find(l => l['@_title'] === 'pdf');
return pdfLink ? pdfLink['@_href'] : null;
}
}
export default ArXivDiscovery;
Test arXiv Discovery
import ArXivDiscovery from './arxiv-discovery.js';
const discovery = new ArXivDiscovery();
const results = await discovery.search('neural networks', 50, 'cs.LG');
console.log(`Found ${results.total} papers`);
console.log('All arXiv papers have PDF URLs:', results.papers.every(p => p.pdfUrl));
arXiv provides free PDF access for all papers. Every result includes a direct PDF download link. Use category filtering (e.g., cs.LG for machine learning) to narrow results.
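When a topic spans several arXiv categories, one option is to query each category separately and merge on arxivId. The sketch below makes that assumption; the searchCategories helper and the pause between queries are illustrative (arXiv asks clients to keep request rates low).
// multi-category-search.js (illustrative sketch)
import ArXivDiscovery from './arxiv-discovery.js';

async function searchCategories(query, categories, perCategory = 25) {
  const discovery = new ArXivDiscovery();
  const byId = new Map();
  for (const category of categories) {
    const { papers } = await discovery.search(query, perCategory, category);
    for (const paper of papers) {
      // Keep the first copy of each arXiv ID; later categories only add new papers.
      if (!byId.has(paper.arxivId)) byId.set(paper.arxivId, paper);
    }
    // Be polite to the arXiv API: pause a few seconds between queries.
    await new Promise(resolve => setTimeout(resolve, 3000));
  }
  return [...byId.values()];
}

// Example: machine learning papers across two related categories.
// const papers = await searchCategories('graph neural networks', ['cs.LG', 'stat.ML']);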
Playwright-Based Scraping
JSTOR Discovery
For databases without APIs, Playwright extracts structured data through browser automation.
Install Playwright
npm install playwright
npx playwright install chromium
Save Authenticated Session
JSTOR requires institutional login. Save authentication state once:
import { chromium } from 'playwright';
const browser = await chromium.launch({ headless: false });
const context = await browser.newContext();
const page = await context.newPage();
await page.goto('https://www.jstor.org/action/showLogin');
// Complete login manually in browser
await page.waitForURL('https://www.jstor.org/', { timeout: 120000 });
// Save authentication state
await context.storageState({ path: 'jstor-auth.json' });
await browser.close();
Implement JSTOR Discovery Class
// jstor-discovery.js
import { chromium } from 'playwright';
class JSTORDiscovery {
constructor(authStatePath = null) {
this.authStatePath = authStatePath;
}
async search(query, maxResults = 50, yearFrom = null) {
const browser = await chromium.launch({ headless: true });
const contextOptions = {
viewport: { width: 1920, height: 1080 },
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
};
if (this.authStatePath) {
contextOptions.storageState = this.authStatePath;
}
const context = await browser.newContext(contextOptions);
const page = await context.newPage();
try {
// Navigate to advanced search
await page.goto('https://www.jstor.org/action/doAdvancedSearch', {
waitUntil: 'networkidle'
});
// Fill search form
await page.fill('input[name="q0"]', query);
// Add year filter if specified
if (yearFrom) {
await page.selectOption('select[name="f0"]', 'year_range');
await page.fill('input[name="year_from"]', yearFrom.toString());
await page.fill('input[name="year_to"]', new Date().getFullYear().toString());
}
// Submit search
await page.click('button[type="submit"]');
await page.waitForSelector('.result-item, .no-results', { timeout: 30000 });
// Check for no results
const noResults = await page.$('.no-results');
if (noResults) {
await browser.close();
return { papers: [], total: 0 };
}
// Extract all results across multiple pages
const papers = await this.extractAllResults(page, maxResults);
await browser.close();
return { papers, total: papers.length };
} catch (error) {
await page.screenshot({ path: `jstor-error-${Date.now()}.png` });
await browser.close();
throw error;
}
}
async extractAllResults(page, maxResults) {
const allPapers = [];
let currentPage = 1;
while (allPapers.length < maxResults) {
// Extract results from current page
const pageResults = await page.$$eval('.result-item', items =>
items.map(item => ({
title: item.querySelector('.title')?.textContent?.trim(),
authors: item.querySelector('.authors')?.textContent?.trim(),
journal: item.querySelector('.journal-info')?.textContent?.trim(),
year: item.querySelector('.year')?.textContent?.trim(),
doi: item.querySelector('.doi')?.textContent?.trim()?.replace('DOI:', '').trim(),
abstract: item.querySelector('.snippet')?.textContent?.trim(),
url: item.querySelector('a.title')?.href,
stableUrl: item.querySelector('[data-stable-url]')?.getAttribute('data-stable-url')
}))
);
allPapers.push(...pageResults);
if (allPapers.length >= maxResults) break;
// Check for next page
const nextButton = await page.$('a.next-page:not(.disabled)');
if (!nextButton) break;
// Navigate to next page
await nextButton.click();
await page.waitForSelector('.result-item', { timeout: 15000 });
await page.waitForTimeout(1000); // Human-like delay
currentPage++;
}
// Enrich with PDF download links
return this.enrichWithDownloadLinks(allPapers.slice(0, maxResults));
}
enrichWithDownloadLinks(papers) {
return papers.map(paper => {
// JSTOR PDF pattern: /stable/pdf/{id}.pdf
if (paper.stableUrl) {
const stableId = paper.stableUrl.split('/stable/')[1];
paper.pdfUrl = `https://www.jstor.org/stable/pdf/${stableId}.pdf`;
paper.pdfAvailable = true;
} else {
paper.pdfAvailable = false;
}
return paper;
});
}
}
export default JSTORDiscovery;
Test JSTOR Discovery
import JSTORDiscovery from './jstor-discovery.js';
const discovery = new JSTORDiscovery('jstor-auth.json');
const results = await discovery.search('digital humanities', 30, 2018);
console.log(`Found ${results.total} papers`);
Rate limiting is critical for scraping. Use human-like delays (1-2 seconds between requests) and avoid parallel requests to the same domain. Aggressive scraping may result in IP bans or CAPTCHA challenges.
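One way to enforce both rules (delays between requests, no parallel requests to the same domain) is a small per-domain queue. The DomainThrottle class below is a sketch built on promise chaining, not an established library; tune the delay range to the target site.
// domain-throttle.js (illustrative sketch)
class DomainThrottle {
  constructor(minDelayMs = 1000, maxDelayMs = 2000) {
    this.minDelayMs = minDelayMs;
    this.maxDelayMs = maxDelayMs;
    this.queues = new Map(); // domain -> tail of that domain's promise chain
  }

  // Serialize tasks per domain and insert a random human-like pause before each one.
  schedule(domain, task) {
    const previous = this.queues.get(domain) || Promise.resolve();
    const next = previous.then(async () => {
      const pause = this.minDelayMs + Math.random() * (this.maxDelayMs - this.minDelayMs);
      await new Promise(resolve => setTimeout(resolve, pause));
      return task();
    });
    // Keep the chain alive even if a task fails.
    this.queues.set(domain, next.catch(() => {}));
    return next;
  }
}

export default DomainThrottle;

// Example: queue two JSTOR searches so they never run in parallel.
// const throttle = new DomainThrottle();
// throttle.schedule('jstor.org', () => discovery.search('digital humanities', 30));
// throttle.schedule('jstor.org', () => discovery.search('media studies', 30));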
IEEE Xplore Discovery
IEEE Xplore requires careful handling of its JavaScript-heavy search interface.
Implement IEEE Discovery Class
// ieee-discovery.js
import { chromium } from 'playwright';
class IEEEDiscovery {
async search(query, maxResults = 50) {
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext({
viewport: { width: 1920, height: 1080 }
});
const page = await context.newPage();
try {
await page.goto('https://ieeexplore.ieee.org/Xplore/home.jsp');
// Enter search query
await page.fill('input[placeholder="Search IEEE Xplore"]', query);
await page.keyboard.press('Enter');
// Wait for results
await page.waitForSelector('.List-results-items, .no-results-message', {
timeout: 30000
});
const noResults = await page.$('.no-results-message');
if (noResults) {
await browser.close();
return { papers: [], total: 0 };
}
// Extract paper metadata
const papers = await this.extractResults(page, maxResults);
await browser.close();
return { papers, total: papers.length };
} catch (error) {
await page.screenshot({ path: `ieee-error-${Date.now()}.png` });
await browser.close();
throw error;
}
}
async extractResults(page, maxResults) {
const results = [];
while (results.length < maxResults) {
const pageResults = await page.$$eval('.List-results-items xpl-results-item', items =>
items.map(item => {
const titleEl = item.querySelector('h3 a');
return {
title: titleEl?.textContent?.trim(),
url: titleEl?.href,
authors: Array.from(item.querySelectorAll('.author'))
.map(a => a.textContent.trim()),
abstract: item.querySelector('.abstract-text')?.textContent?.trim(),
year: item.querySelector('.publisher-info-container')?.textContent
?.match(/\b(19|20)\d{2}\b/)?.[0],
doi: item.querySelector('.stats-document-abstract-doi a')?.textContent?.trim(),
documentId: titleEl?.href?.match(/document\/(\d+)/)?.[1]
};
})
);
results.push(...pageResults.filter(r => r.title));
if (results.length >= maxResults) break;
// Check for pagination
const nextButton = await page.$('button[aria-label="Next page"]');
if (!nextButton || await nextButton.isDisabled()) break;
await nextButton.click();
await page.waitForTimeout(2000);
await page.waitForSelector('.List-results-items');
}
return this.enrichWithPDFLinks(results.slice(0, maxResults));
}
enrichWithPDFLinks(papers) {
return papers.map(paper => {
if (paper.documentId) {
paper.pdfUrl = `https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=${paper.documentId}`;
paper.pdfAvailable = true;
} else {
paper.pdfAvailable = false;
}
return paper;
});
}
}
export default IEEEDiscovery;
Test IEEE Discovery
import IEEEDiscovery from './ieee-discovery.js';
const discovery = new IEEEDiscovery();
const results = await discovery.search('deep learning computer vision', 40);
console.log(`Found ${results.total} papers`);
Multi-Database Deduplication
Searching multiple databases returns duplicates. Smart deduplication merges results while preserving alternative PDF sources.
Implement Deduplication Strategy
// deduplication.js
class PaperDeduplicator {
/**
* Deduplicate papers using multiple strategies
*/
deduplicate(papers) {
const seen = new Map();
const unique = [];
for (const paper of papers) {
const key = this.generateKey(paper);
if (!seen.has(key)) {
seen.set(key, paper);
unique.push(paper);
} else {
// Merge metadata from duplicate
const existing = seen.get(key);
this.mergePaperData(existing, paper);
}
}
return unique;
}
/**
* Generate deduplication key using multiple identifiers
*/
generateKey(paper) {
// Priority 1: DOI (most reliable)
if (paper.doi) {
return `doi:${this.normalizeDOI(paper.doi)}`;
}
// Priority 2: arXiv ID
if (paper.arxivId) {
return `arxiv:${paper.arxivId}`;
}
// Priority 3: PubMed ID
if (paper.pmid) {
return `pmid:${paper.pmid}`;
}
// Priority 4: Title + First Author
const normalizedTitle = this.normalizeTitle(paper.title);
const firstAuthor = this.extractFirstAuthor(paper.authors);
return `title:${normalizedTitle}:${firstAuthor}`;
}
normalizeDOI(doi) {
return doi.toLowerCase()
.replace(/^https?:\/\/doi\.org\//i, '')
.replace(/^doi:\s*/i, '')
.trim();
}
normalizeTitle(title) {
return title
.toLowerCase()
.replace(/[^\w\s]/g, '') // Remove punctuation
.replace(/\s+/g, ' ') // Normalize whitespace
.trim();
}
extractFirstAuthor(authors) {
if (!authors || authors.length === 0) return '';
const first = Array.isArray(authors) ? authors[0] : authors;
return first.toLowerCase().replace(/[^\w]/g, '');
}
/**
* Merge metadata from duplicate papers
*/
mergePaperData(existing, duplicate) {
// Add alternative PDF sources
if (duplicate.pdfUrl && !existing.alternativePDFUrls) {
existing.alternativePDFUrls = [];
}
if (duplicate.pdfUrl && duplicate.pdfUrl !== existing.pdfUrl) {
existing.alternativePDFUrls.push(duplicate.pdfUrl);
}
// Merge keywords
if (duplicate.keywords) {
existing.keywords = [
...(existing.keywords || []),
...duplicate.keywords
].filter((v, i, a) => a.indexOf(v) === i); // Unique
}
// Keep more complete abstract
if (duplicate.abstract && duplicate.abstract.length > (existing.abstract?.length || 0)) {
existing.abstract = duplicate.abstract;
}
// Track sources
if (!existing.sources) existing.sources = [];
if (duplicate.source) existing.sources.push(duplicate.source);
}
}
export default PaperDeduplicator;
Use Deduplication in Multi-Database Search
import PubMedDiscovery from './pubmed-discovery.js';
import ArXivDiscovery from './arxiv-discovery.js';
import JSTORDiscovery from './jstor-discovery.js';
import PaperDeduplicator from './deduplication.js';
async function searchMultipleDatabases(query) {
const pubmed = new PubMedDiscovery();
const arxiv = new ArXivDiscovery();
const jstor = new JSTORDiscovery('jstor-auth.json');
const deduplicator = new PaperDeduplicator();
// Search all databases in parallel
const [pubmedResults, arxivResults, jstorResults] = await Promise.all([
pubmed.search(query, 50).catch(() => ({ papers: [] })),
arxiv.search(query, 50).catch(() => ({ papers: [] })),
jstor.search(query, 50).catch(() => ({ papers: [] }))
]);
// Tag papers with source database
pubmedResults.papers.forEach(p => p.source = 'pubmed');
arxivResults.papers.forEach(p => p.source = 'arxiv');
jstorResults.papers.forEach(p => p.source = 'jstor');
// Combine and deduplicate
const allPapers = [
...pubmedResults.papers,
...arxivResults.papers,
...jstorResults.papers
];
const uniquePapers = deduplicator.deduplicate(allPapers);
console.log(`Total papers found: ${allPapers.length}`);
console.log(`Unique papers after deduplication: ${uniquePapers.length}`);
return uniquePapers;
}
Deduplication priority hierarchy: DOI (most reliable) → arXiv ID → PubMed ID → Title + First Author. This ensures accurate matching while preserving alternative PDF sources from different databases.
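To see the merge behavior concretely, the snippet below runs the deduplicator on two made-up records that share a DOI; the sample data is purely illustrative.
// dedup-example.js (illustrative sketch with made-up records)
import PaperDeduplicator from './deduplication.js';

const deduplicator = new PaperDeduplicator();

const papers = [
  {
    title: 'Example Paper on Machine Learning',
    doi: 'https://doi.org/10.1000/example',   // normalizes to 10.1000/example
    authors: ['Doe, Jane'],
    pdfUrl: 'https://arxiv.org/pdf/0000.00000',
    source: 'arxiv',
    abstract: 'Short abstract.'
  },
  {
    title: 'Example paper on machine learning',  // same paper, different casing
    doi: '10.1000/example',
    authors: ['Doe, Jane'],
    pdfUrl: 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC0000000/pdf/',
    source: 'pubmed',
    abstract: 'A longer, more complete abstract from the second database.'
  }
];

const unique = deduplicator.deduplicate(papers);
console.log(unique.length);                 // 1: both records share the same DOI key
console.log(unique[0].alternativePDFUrls);  // second PDF URL preserved as an alternative
console.log(unique[0].abstract);            // longer abstract kept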