diff --git a/semantic-scholar-client/.env.example b/semantic-scholar-client/.env.example new file mode 100644 index 0000000..059442f --- /dev/null +++ b/semantic-scholar-client/.env.example @@ -0,0 +1 @@ +SEMANTIC_SCHOLAR_API_KEY= \ No newline at end of file diff --git a/semantic-scholar-client/.gitignore b/semantic-scholar-client/.gitignore index ea8c4bf..fedaa2b 100644 --- a/semantic-scholar-client/.gitignore +++ b/semantic-scholar-client/.gitignore @@ -1 +1,2 @@ /target +.env diff --git a/semantic-scholar-client/Cargo.lock b/semantic-scholar-client/Cargo.lock index f2cf698..e41ebf6 100644 --- a/semantic-scholar-client/Cargo.lock +++ b/semantic-scholar-client/Cargo.lock @@ -266,6 +266,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "dotenv" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1206,6 +1212,7 @@ version = "0.1.0" dependencies = [ "async-recursion", "clap", + "dotenv", "mongodb", "reqwest", "serde", diff --git a/semantic-scholar-client/Cargo.toml b/semantic-scholar-client/Cargo.toml index baaf309..1102ca1 100644 --- a/semantic-scholar-client/Cargo.toml +++ b/semantic-scholar-client/Cargo.toml @@ -2,12 +2,14 @@ name = "semantic-scholar-client" version = "0.1.0" edition = "2021" +default-run = "import" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] async-recursion = "1.0.0" clap = { version = "3.2.11", features = ["derive"] } +dotenv = "0.15.0" mongodb = "2.2.2" reqwest = { version = "0.11.11", features = ["json"] } serde = { version = "1.0.139", features = ["derive"] } diff --git a/semantic-scholar-client/README.md b/semantic-scholar-client/README.md index b8c6b3f..8e1ab37 100644 --- a/semantic-scholar-client/README.md +++ b/semantic-scholar-client/README.md @@ -8,6 +8,9 @@ Work in progress to pipe this data into an operating database. ### Usage +* (Optional) Copy `.env.example` to `.env` and set the value of `SEMANTIC_SCHOLAR_API_KEY` +* Run the program + cargo run -- --paper-id --depth * `paper_id` values are in accordance with [Semantic Scholar API](https://api.semanticscholar.org/api-docs/). @@ -19,3 +22,4 @@ Ideas for followup work: - Consider strategies for deciding where to terminate a given traversal - Provide an HTTP/WebSocket interface that can be used to talk to this process during its operation. This can enable us to pipe the data to other tasks, to monitor, to start/stop, and even to make configuration changes. +- Rate limit requests \ No newline at end of file diff --git a/semantic-scholar-client/src/bin/import.rs b/semantic-scholar-client/src/bin/import.rs index d741bfe..7857b4f 100644 --- a/semantic-scholar-client/src/bin/import.rs +++ b/semantic-scholar-client/src/bin/import.rs @@ -3,10 +3,11 @@ use async_recursion::async_recursion; use clap::Parser; -use std::cmp::min; -use std::fmt::Write; -use std::error::Error; +use dotenv::dotenv; use serde::Deserialize; +use std::cmp::min; +use std::error::Error; +use std::fmt::Write; type DataResult = Result>; @@ -23,14 +24,13 @@ struct Args { /// Starting paper. We will traverse papers that cite this one #[clap(short, long, value_parser)] paper_id: String, - // Write the results to MongoDB // #[clap(short, long, value_parser)] // write_to_mongo: bool, } struct Author { - name: String + name: String, } type Authors = Vec; @@ -56,22 +56,28 @@ struct CitingPaper { #[derive(Deserialize, Debug)] #[serde(rename_all = "camelCase")] struct Citation { - citing_paper: CitingPaper + citing_paper: CitingPaper, } /** - * Generic struct to wrap the common API response pattern {data: [...]} - */ + code: Option, +* Generic struct to wrap the common API response pattern {data: [...]} +*/ #[derive(Deserialize, Debug)] struct ApiListResponse { - data: Vec + data: Option>, + message: Option, } - // TODO: Cache results in a (separate but local) database such as Redis // TODO: Store results in a (separate but local) database such as Postgres #[async_recursion] -async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec) -> DataResult> { +async fn get_citations( + client: &reqwest::Client, + paper_id: String, + depth: u32, + authors: &mut Vec, +) -> DataResult> { // Bound recursion to some depth if depth > MAX_DEPTH { return Ok(vec![]); @@ -81,12 +87,15 @@ async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec) let mut url = String::new(); write!(&mut url, "{}/paper/{}/citations", BASE_URL, paper_id)?; - let resp = reqwest::get(url) - .await? - .text() - .await?; + let mut req = client.get(url); + let api_key = std::env::var("SEMANTIC_SCHOLAR_API_KEY"); + if api_key.is_ok() { + req = req.header("x-api-key", api_key.unwrap()); + } + let resp = req.send().await?.text().await?; - let resp_deserialized_attempt = serde_json::from_str::>(resp.as_str()); + let resp_deserialized_attempt = + serde_json::from_str::>(resp.as_str()); if let Err(err) = resp_deserialized_attempt { println!("depth {} paper {} error {}", depth, paper_id, err); @@ -95,53 +104,50 @@ async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec) let resp_deserialized: ApiListResponse = resp_deserialized_attempt.unwrap(); - for Citation{citing_paper: CitingPaper{paper_id: citing_paper_id, title}} in resp_deserialized.data { + if resp_deserialized.message.is_some() { + println!( + "depth {} paper {} error {}", + depth, + paper_id, + resp_deserialized.message.unwrap() + ); + return Ok(vec![]); + } + + for Citation { + citing_paper: + CitingPaper { + paper_id: citing_paper_id, + title, + }, + } in resp_deserialized.data.unwrap() + { if let (Some(citing_paper_id), Some(title)) = (citing_paper_id, title) { let short_len = min(50, title.len()); let (short_title, _) = title.split_at(short_len); - println!("depth {} paper {} cites {} title {}", depth, citing_paper_id, paper_id, short_title); + println!( + "depth {} paper {} cites {} title {}", + depth, citing_paper_id, paper_id, short_title + ); - get_citations(citing_paper_id, depth + 1, authors).await?; + get_citations(&client, citing_paper_id, depth + 1, authors).await?; } } Ok(vec![]) } -async fn get_paper_info(paper_id: String, depth: u32, authors: &mut Authors) -> DataResult> { - // Build the URL - let mut url = String::new(); - // Probably also want: year,publicationDate,journal", BASE_URL, paper_id)?; - const fields: &str = "title, authors, citations"; - write!(&mut url, "{}/paper/{}?fields={}", BASE_URL, paper_id, fields)?; - - let resp = reqwest::get(url) - .await? - .text() - .await?; - - let resp_deserialized_attempt = serde_json::from_str::>(resp.as_str()); - - if let Err(err) = resp_deserialized_attempt { - println!("depth {} paper {} error {}", depth, paper_id, err); - return Ok(vec![]); - } - - let resp_deserialized: ApiListResponse = resp_deserialized_attempt.unwrap(); - Ok(vec![]) -} - #[tokio::main] async fn main() -> Result<(), Box> { - let Args{ - depth, - paper_id, - // write_to_mongo, - } = Args::parse(); + let Args { depth, paper_id } = Args::parse(); + + dotenv().ok(); let mut authors = Authors::new(); - get_citations(paper_id, depth, &mut authors).await?; + let client: reqwest::Client = reqwest::Client::new(); + + get_citations(&client, paper_id, depth, &mut authors).await?; Ok(()) -} \ No newline at end of file +}