Semantic Scholar API key support

This commit is contained in:
Ladd Hoffman 2024-04-04 11:56:58 -05:00
parent 68d04117c9
commit c80f2ee79b
6 changed files with 70 additions and 49 deletions

View File

@ -0,0 +1 @@
SEMANTIC_SCHOLAR_API_KEY=

View File

@ -1 +1,2 @@
/target /target
.env

View File

@ -266,6 +266,12 @@ dependencies = [
"subtle", "subtle",
] ]
[[package]]
name = "dotenv"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"
[[package]] [[package]]
name = "encoding_rs" name = "encoding_rs"
version = "0.8.31" version = "0.8.31"
@ -1206,6 +1212,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"async-recursion", "async-recursion",
"clap", "clap",
"dotenv",
"mongodb", "mongodb",
"reqwest", "reqwest",
"serde", "serde",

View File

@ -2,12 +2,14 @@
name = "semantic-scholar-client" name = "semantic-scholar-client"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
default-run = "import"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
async-recursion = "1.0.0" async-recursion = "1.0.0"
clap = { version = "3.2.11", features = ["derive"] } clap = { version = "3.2.11", features = ["derive"] }
dotenv = "0.15.0"
mongodb = "2.2.2" mongodb = "2.2.2"
reqwest = { version = "0.11.11", features = ["json"] } reqwest = { version = "0.11.11", features = ["json"] }
serde = { version = "1.0.139", features = ["derive"] } serde = { version = "1.0.139", features = ["derive"] }

View File

@ -8,6 +8,9 @@ Work in progress to pipe this data into an operating database.
### Usage ### Usage
* (Optional) Copy `.env.example` to `.env` and set the value of `SEMANTIC_SCHOLAR_API_KEY`
* Run the program
cargo run -- --paper-id <paper_id> --depth <depth> cargo run -- --paper-id <paper_id> --depth <depth>
* `paper_id` values are in accordance with [Semantic Scholar API](https://api.semanticscholar.org/api-docs/). * `paper_id` values are in accordance with [Semantic Scholar API](https://api.semanticscholar.org/api-docs/).
@ -19,3 +22,4 @@ Ideas for followup work:
- Consider strategies for deciding where to terminate a given traversal - Consider strategies for deciding where to terminate a given traversal
- Provide an HTTP/WebSocket interface that can be used to talk to this process during its operation. - Provide an HTTP/WebSocket interface that can be used to talk to this process during its operation.
This can enable us to pipe the data to other tasks, to monitor, to start/stop, and even to make configuration changes. This can enable us to pipe the data to other tasks, to monitor, to start/stop, and even to make configuration changes.
- Rate limit requests

View File

@ -3,10 +3,11 @@
use async_recursion::async_recursion; use async_recursion::async_recursion;
use clap::Parser; use clap::Parser;
use std::cmp::min; use dotenv::dotenv;
use std::fmt::Write;
use std::error::Error;
use serde::Deserialize; use serde::Deserialize;
use std::cmp::min;
use std::error::Error;
use std::fmt::Write;
type DataResult<T> = Result<T, Box<dyn Error>>; type DataResult<T> = Result<T, Box<dyn Error>>;
@ -23,14 +24,13 @@ struct Args {
/// Starting paper. We will traverse papers that cite this one /// Starting paper. We will traverse papers that cite this one
#[clap(short, long, value_parser)] #[clap(short, long, value_parser)]
paper_id: String, paper_id: String,
// Write the results to MongoDB // Write the results to MongoDB
// #[clap(short, long, value_parser)] // #[clap(short, long, value_parser)]
// write_to_mongo: bool, // write_to_mongo: bool,
} }
struct Author { struct Author {
name: String name: String,
} }
type Authors = Vec<Author>; type Authors = Vec<Author>;
@ -56,22 +56,28 @@ struct CitingPaper {
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct Citation { struct Citation {
citing_paper: CitingPaper citing_paper: CitingPaper,
} }
/** /**
* Generic struct to wrap the common API response pattern {data: [...]} code: Option<String>,
*/ * Generic struct to wrap the common API response pattern {data: [...]}
*/
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
struct ApiListResponse<T> { struct ApiListResponse<T> {
data: Vec<T> data: Option<Vec<T>>,
message: Option<String>,
} }
// TODO: Cache results in a (separate but local) database such as Redis // TODO: Cache results in a (separate but local) database such as Redis
// TODO: Store results in a (separate but local) database such as Postgres // TODO: Store results in a (separate but local) database such as Postgres
#[async_recursion] #[async_recursion]
async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec<Author>) -> DataResult<Vec<Citation>> { async fn get_citations(
client: &reqwest::Client,
paper_id: String,
depth: u32,
authors: &mut Vec<Author>,
) -> DataResult<Vec<Citation>> {
// Bound recursion to some depth // Bound recursion to some depth
if depth > MAX_DEPTH { if depth > MAX_DEPTH {
return Ok(vec![]); return Ok(vec![]);
@ -81,12 +87,15 @@ async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec<Author>)
let mut url = String::new(); let mut url = String::new();
write!(&mut url, "{}/paper/{}/citations", BASE_URL, paper_id)?; write!(&mut url, "{}/paper/{}/citations", BASE_URL, paper_id)?;
let resp = reqwest::get(url) let mut req = client.get(url);
.await? let api_key = std::env::var("SEMANTIC_SCHOLAR_API_KEY");
.text() if api_key.is_ok() {
.await?; req = req.header("x-api-key", api_key.unwrap());
}
let resp = req.send().await?.text().await?;
let resp_deserialized_attempt = serde_json::from_str::<ApiListResponse<Citation>>(resp.as_str()); let resp_deserialized_attempt =
serde_json::from_str::<ApiListResponse<Citation>>(resp.as_str());
if let Err(err) = resp_deserialized_attempt { if let Err(err) = resp_deserialized_attempt {
println!("depth {} paper {} error {}", depth, paper_id, err); println!("depth {} paper {} error {}", depth, paper_id, err);
@ -95,53 +104,50 @@ async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec<Author>)
let resp_deserialized: ApiListResponse<Citation> = resp_deserialized_attempt.unwrap(); let resp_deserialized: ApiListResponse<Citation> = resp_deserialized_attempt.unwrap();
for Citation{citing_paper: CitingPaper{paper_id: citing_paper_id, title}} in resp_deserialized.data { if resp_deserialized.message.is_some() {
println!(
"depth {} paper {} error {}",
depth,
paper_id,
resp_deserialized.message.unwrap()
);
return Ok(vec![]);
}
for Citation {
citing_paper:
CitingPaper {
paper_id: citing_paper_id,
title,
},
} in resp_deserialized.data.unwrap()
{
if let (Some(citing_paper_id), Some(title)) = (citing_paper_id, title) { if let (Some(citing_paper_id), Some(title)) = (citing_paper_id, title) {
let short_len = min(50, title.len()); let short_len = min(50, title.len());
let (short_title, _) = title.split_at(short_len); let (short_title, _) = title.split_at(short_len);
println!("depth {} paper {} cites {} title {}", depth, citing_paper_id, paper_id, short_title); println!(
"depth {} paper {} cites {} title {}",
depth, citing_paper_id, paper_id, short_title
);
get_citations(citing_paper_id, depth + 1, authors).await?; get_citations(&client, citing_paper_id, depth + 1, authors).await?;
} }
} }
Ok(vec![]) Ok(vec![])
} }
async fn get_paper_info(paper_id: String, depth: u32, authors: &mut Authors) -> DataResult<Vec<Paper>> {
// Build the URL
let mut url = String::new();
// Probably also want: year,publicationDate,journal", BASE_URL, paper_id)?;
const fields: &str = "title, authors, citations";
write!(&mut url, "{}/paper/{}?fields={}", BASE_URL, paper_id, fields)?;
let resp = reqwest::get(url)
.await?
.text()
.await?;
let resp_deserialized_attempt = serde_json::from_str::<ApiListResponse<Paper>>(resp.as_str());
if let Err(err) = resp_deserialized_attempt {
println!("depth {} paper {} error {}", depth, paper_id, err);
return Ok(vec![]);
}
let resp_deserialized: ApiListResponse<Paper> = resp_deserialized_attempt.unwrap();
Ok(vec![])
}
#[tokio::main] #[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> { async fn main() -> Result<(), Box<dyn Error>> {
let Args{ let Args { depth, paper_id } = Args::parse();
depth,
paper_id, dotenv().ok();
// write_to_mongo,
} = Args::parse();
let mut authors = Authors::new(); let mut authors = Authors::new();
get_citations(paper_id, depth, &mut authors).await?; let client: reqwest::Client = reqwest::Client::new();
get_citations(&client, paper_id, depth, &mut authors).await?;
Ok(()) Ok(())
} }