Compare commits
13 Commits
Author | SHA1 | Date |
---|---|---|
Ladd Hoffman | 24c183912a | |
Ladd Hoffman | ad382b5caf | |
Ladd Hoffman | c80f2ee79b | |
Ladd Hoffman | 846eb73cea | |
Ladd Hoffman | 1f3d8a7d1e | |
Ladd Hoffman | 4d53f5c70e | |
Ladd Hoffman | ae5ab09e16 | |
Ladd Hoffman | 82e026f327 | |
Ladd Hoffman | 8bb188ff13 | |
Ladd Hoffman | ce4f78aa97 | |
Ladd Hoffman | 68d04117c9 | |
Ladd Hoffman | ff7d6134f1 | |
Ladd Hoffman | 43462e84ea |
|
@ -4,4 +4,4 @@
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --- | --- |
|
| --- | --- |
|
||||||
| [forum-logic](https://gitlab.com/dao-governance-framework/forum-logic) | Javascript prototyping forum architecture |
|
| [semantic-scholar-client](./semantic-scholar-client) | Rust utility for reading data from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs) |
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
SEMANTIC_SCHOLAR_API_KEY=
|
|
@ -0,0 +1,2 @@
|
||||||
|
/target
|
||||||
|
.env
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,17 @@
|
||||||
|
[package]
|
||||||
|
name = "semantic-scholar-client"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
default-run = "import"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
async-recursion = "1.0.0"
|
||||||
|
clap = { version = "3.2.11", features = ["derive"] }
|
||||||
|
dotenv = "0.15.0"
|
||||||
|
mongodb = "2.2.2"
|
||||||
|
reqwest = { version = "0.11.11", features = ["json"] }
|
||||||
|
serde = { version = "1.0.139", features = ["derive"] }
|
||||||
|
serde_json = "1.0.82"
|
||||||
|
tokio = { version = "1.20.0", features = ["full"] }
|
|
@ -0,0 +1,25 @@
|
||||||
|
# `semantic-scholar-client`
|
||||||
|
|
||||||
|
This utility fetches data from the Semantic Scholar API.
|
||||||
|
|
||||||
|
The initial proof of concept writes its results to stdout.
|
||||||
|
|
||||||
|
Work in progress to pipe this data into an operating database.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
* (Optional) Copy `.env.example` to `.env` and set the value of `SEMANTIC_SCHOLAR_API_KEY`
|
||||||
|
* Run the program
|
||||||
|
|
||||||
|
cargo run -- --paper-id <paper_id> --depth <depth>
|
||||||
|
|
||||||
|
* `paper_id` values are in accordance with [Semantic Scholar API](https://api.semanticscholar.org/api-docs/).
|
||||||
|
* `depth` is the number of citations to traverse, from the starting paper.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
Ideas for followup work:
|
||||||
|
- Consider strategies for deciding where to terminate a given traversal
|
||||||
|
- Provide an HTTP/WebSocket interface that can be used to talk to this process during its operation.
|
||||||
|
This can enable us to pipe the data to other tasks, to monitor, to start/stop, and even to make configuration changes.
|
||||||
|
- Rate limit requests
|
|
@ -0,0 +1,153 @@
|
||||||
|
// During development, allowing dead code
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
use async_recursion::async_recursion;
|
||||||
|
use clap::Parser;
|
||||||
|
use dotenv::dotenv;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use std::cmp::min;
|
||||||
|
use std::error::Error;
|
||||||
|
use std::fmt::Write;
|
||||||
|
|
||||||
|
/// Result alias used throughout this binary: any error is boxed as a trait object.
type DataResult<T> = std::result::Result<T, Box<dyn Error>>;

/// Root of the Semantic Scholar Graph API; request paths are appended to it.
const BASE_URL: &str = "https://api.semanticscholar.org/graph/v1";

/// Deepest recursion level at which `get_citations` still issues a request.
const MAX_DEPTH: u32 = 3;
|
||||||
|
|
||||||
|
#[derive(Parser, Debug)]
|
||||||
|
#[clap(author, version, about, long_about = None)]
|
||||||
|
struct Args {
|
||||||
|
/// How deep to traverse citation graph from the starting paper
|
||||||
|
#[clap(short, long, value_parser)]
|
||||||
|
depth: u32,
|
||||||
|
|
||||||
|
/// Starting paper. We will traverse papers that cite this one
|
||||||
|
#[clap(short, long, value_parser)]
|
||||||
|
paper_id: String,
|
||||||
|
// Write the results to MongoDB
|
||||||
|
// #[clap(short, long, value_parser)]
|
||||||
|
// write_to_mongo: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An author, identified only by display name.
///
/// Derives the same basic traits as the other data structs in this file so
/// values can be printed with `{:?}`, cloned and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Author {
    /// The author's name.
    name: String,
}

/// Collection of authors threaded through the citation traversal.
type Authors = Vec<Author>;
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
struct Paper {
|
||||||
|
paper_id: String,
|
||||||
|
title: Option<String>,
|
||||||
|
citations: Vec<Citation>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Occurs within Citation struct
|
||||||
|
*/
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
struct CitingPaper {
|
||||||
|
paper_id: Option<String>,
|
||||||
|
title: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
struct Citation {
|
||||||
|
citing_paper: CitingPaper,
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
code: Option<String>,
|
||||||
|
* Generic struct to wrap the common API response pattern {data: [...]}
|
||||||
|
*/
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
struct ApiListResponse<T> {
|
||||||
|
data: Option<Vec<T>>,
|
||||||
|
message: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Cache results in a (separate but local) database such as Redis
|
||||||
|
// TODO: Store results in a (separate but local) database such as Postgres
|
||||||
|
#[async_recursion]
|
||||||
|
async fn get_citations(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
paper_id: String,
|
||||||
|
depth: u32,
|
||||||
|
authors: &mut Vec<Author>,
|
||||||
|
) -> DataResult<Vec<Citation>> {
|
||||||
|
// Bound recursion to some depth
|
||||||
|
if depth > MAX_DEPTH {
|
||||||
|
return Ok(vec![]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the URL
|
||||||
|
let mut url = String::new();
|
||||||
|
write!(&mut url, "{}/paper/{}/citations", BASE_URL, paper_id)?;
|
||||||
|
|
||||||
|
let mut req = client.get(url);
|
||||||
|
let api_key = std::env::var("SEMANTIC_SCHOLAR_API_KEY");
|
||||||
|
if api_key.is_ok() {
|
||||||
|
req = req.header("x-api-key", api_key.unwrap());
|
||||||
|
}
|
||||||
|
let resp = req.send().await?.text().await?;
|
||||||
|
|
||||||
|
let resp_deserialized_attempt =
|
||||||
|
serde_json::from_str::<ApiListResponse<Citation>>(resp.as_str());
|
||||||
|
|
||||||
|
if let Err(err) = resp_deserialized_attempt {
|
||||||
|
println!("depth {} paper {} error {}", depth, paper_id, err);
|
||||||
|
return Ok(vec![]);
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp_deserialized: ApiListResponse<Citation> = resp_deserialized_attempt.unwrap();
|
||||||
|
|
||||||
|
if resp_deserialized.message.is_some() {
|
||||||
|
println!(
|
||||||
|
"depth {} paper {} error {}",
|
||||||
|
depth,
|
||||||
|
paper_id,
|
||||||
|
resp_deserialized.message.unwrap()
|
||||||
|
);
|
||||||
|
return Ok(vec![]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for Citation {
|
||||||
|
citing_paper:
|
||||||
|
CitingPaper {
|
||||||
|
paper_id: citing_paper_id,
|
||||||
|
title,
|
||||||
|
},
|
||||||
|
} in resp_deserialized.data.unwrap()
|
||||||
|
{
|
||||||
|
if let (Some(citing_paper_id), Some(title)) = (citing_paper_id, title) {
|
||||||
|
let short_len = min(50, title.len());
|
||||||
|
let (short_title, _) = title.split_at(short_len);
|
||||||
|
println!(
|
||||||
|
"depth {} paper {} cites {} title {}",
|
||||||
|
depth, citing_paper_id, paper_id, short_title
|
||||||
|
);
|
||||||
|
|
||||||
|
get_citations(&client, citing_paper_id, depth + 1, authors).await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(vec![])
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<(), Box<dyn Error>> {
|
||||||
|
let Args { depth, paper_id } = Args::parse();
|
||||||
|
|
||||||
|
dotenv().ok();
|
||||||
|
|
||||||
|
let mut authors = Authors::new();
|
||||||
|
|
||||||
|
let client: reqwest::Client = reqwest::Client::new();
|
||||||
|
|
||||||
|
get_citations(&client, paper_id, depth, &mut authors).await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -0,0 +1,59 @@
|
||||||
|
|
||||||
|
use mongodb::{Client, options::ClientOptions};
|
||||||
|
|
||||||
|
/// Connection string for the MongoDB deployment this test binary talks to.
// NOTE(review): the embedded user/password look like local development
// defaults — confirm they are never used for a shared deployment.
const MONGO_DB_ADDRESS: &str = "mongodb://docker:mongopw@localhost:55000";
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
// Parse a connection string into an options struct.
|
||||||
|
let client_options = ClientOptions::parse(MONGO_DB_ADDRESS).await?;
|
||||||
|
|
||||||
|
// Get a handle to the deployment.
|
||||||
|
let client = Client::with_options(client_options)?;
|
||||||
|
|
||||||
|
// Try creating a collection
|
||||||
|
{
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct Book {
|
||||||
|
title: String,
|
||||||
|
author: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reference a (new) database
|
||||||
|
let db = client.database("db2");
|
||||||
|
|
||||||
|
// Get a handle to a collection of `Book`.
|
||||||
|
let typed_collection = db.collection::<Book>("books");
|
||||||
|
|
||||||
|
let books = vec![
|
||||||
|
Book {
|
||||||
|
title: "The Grapes of Wrath".to_string(),
|
||||||
|
author: "John Steinbeck".to_string(),
|
||||||
|
},
|
||||||
|
Book {
|
||||||
|
title: "To Kill a Mockingbird".to_string(),
|
||||||
|
author: "Harper Lee".to_string(),
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
// Insert the books into "mydb.books" collection, no manual conversion to BSON necessary.
|
||||||
|
typed_collection.insert_many(books, None).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// List the names of the databases in that deployment.
|
||||||
|
for db_name in client.list_database_names(None, None).await? {
|
||||||
|
println!("{}", db_name);
|
||||||
|
// Get a handle to a database.
|
||||||
|
let db = client.database(db_name.as_str());
|
||||||
|
|
||||||
|
// List the names of the collections in that database.
|
||||||
|
for collection_name in db.list_collection_names(None).await? {
|
||||||
|
println!("- {}", collection_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
Loading…
Reference in New Issue