Skip to content

Commit

Permalink
Make built-in adapters' identifiers configurable
Browse files Browse the repository at this point in the history
This will allow end users to provide their own lists of extensions and/or
mimetypes for each of the built-in adapters.
  • Loading branch information
lafrenierejm committed Sep 4, 2024
1 parent ebd32ab commit 24f57ef
Show file tree
Hide file tree
Showing 15 changed files with 760 additions and 298 deletions.
2 changes: 2 additions & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
use flake

PATH_add "$(git rev-parse --show-toplevel)/result/bin"
205 changes: 181 additions & 24 deletions src/adapters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
use anyhow::{format_err, Context, Result};
use async_trait::async_trait;
use custom::CustomAdapterConfig;
use custom::CustomIdentifiers;
use custom::BUILTIN_SPAWNING_ADAPTERS;
use log::*;
use tokio::io::AsyncRead;
Expand All @@ -35,7 +36,7 @@ pub struct AdapterMeta {
/// indicates whether this adapter can descend (=call rga_preproc again). if true, the cache key needs to include the list of active adapters
pub recurses: bool,
/// list of matchers (interpreted as a OR b OR ...)
pub fast_matchers: Vec<FastFileMatcher>,
pub fast_matchers: Option<Vec<FastFileMatcher>>,
/// list of matchers when we have mime type detection active (interpreted as ORed)
/// warning: this *overrides* the fast matchers
pub slow_matchers: Option<Vec<FileMatcher>>,
Expand All @@ -48,39 +49,65 @@ pub struct AdapterMeta {
}
impl AdapterMeta {
// todo: this is pretty ugly
pub fn get_matchers<'a>(
&'a self,
slow: bool,
) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + 'a> {
pub fn get_matchers(&self, slow: bool) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + '_> {
match (
slow,
self.keep_fast_matchers_if_accurate,
&self.slow_matchers,
&self.fast_matchers,
) {
(true, false, Some(ref sm)) => Box::new(sm.iter().map(Cow::Borrowed)),
(true, true, Some(ref sm)) => Box::new(
(true, false, Some(ref sm), _) => Box::new(sm.iter().map(Cow::Borrowed)),
(true, true, Some(ref sm), Some(ref fm)) => Box::new(
sm.iter().map(Cow::Borrowed).chain(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
fm.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone())))
.collect::<Vec<_>>(),
),
),
// don't have slow matchers or slow matching disabled
(true, _, None) | (false, _, _) => Box::new(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
),
(true, _, None, Some(ref fm)) | (false, _, _, Some(ref fm)) => {
Box::new(fm.iter().map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))))
}
_ => Box::new(::std::iter::empty()),
}
}
}

pub trait GetMetadata {
fn metadata(&self) -> &AdapterMeta;
/// Describes an adapter; implementors supply the individual properties and
/// get an assembled [`AdapterMeta`] from the provided `metadata` method.
pub trait Adapter {
    /// Name of the adapter, copied into `AdapterMeta::name`.
    fn name(&self) -> String;
    /// Version of the adapter, copied into `AdapterMeta::version`.
    fn version(&self) -> i32;
    /// Description of the adapter, copied into `AdapterMeta::description`.
    fn description(&self) -> String;
    /// Whether this adapter can descend (=call rga_preproc again).
    fn recurses(&self) -> bool;
    /// Copied into `AdapterMeta::disabled_by_default`.
    fn disabled_by_default(&self) -> bool;
    /// Copied into `AdapterMeta::keep_fast_matchers_if_accurate`.
    fn keep_fast_matchers_if_accurate(&self) -> bool;
    /// File extensions identifying this adapter's files; `None` leaves the
    /// fast matcher list unset.
    fn extensions(&self) -> Option<Vec<String>>;
    /// Mime types identifying this adapter's files; `None` leaves the slow
    /// matcher list unset.
    fn mimetypes(&self) -> Option<Vec<String>>;

    /// Assembles the adapter's metadata from the accessor methods above.
    fn metadata(&self) -> AdapterMeta {
        AdapterMeta {
            name: self.name(),
            version: self.version(),
            description: self.description(),
            // Report what the adapter declares instead of assuming that
            // every adapter recurses.
            recurses: self.recurses(),
            // The Vecs are owned here, so the strings move straight into the
            // matchers without being cloned.
            fast_matchers: self.extensions().map(|exts| {
                exts.into_iter()
                    .map(FastFileMatcher::FileExtension)
                    .collect()
            }),
            slow_matchers: self.mimetypes().map(|mimetypes| {
                mimetypes.into_iter().map(FileMatcher::MimeType).collect()
            }),
            disabled_by_default: self.disabled_by_default(),
            keep_fast_matchers_if_accurate: self.keep_fast_matchers_if_accurate(),
        }
    }
}

#[async_trait]
pub trait FileAdapter: GetMetadata + Send + Sync {
pub trait FileAdapter: Adapter + Send + Sync {
/// adapt a file.
///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
Expand Down Expand Up @@ -109,7 +136,10 @@ pub struct AdaptInfo {
/// (enabledAdapters, disabledAdapters)
type AdaptersTuple = (Vec<Arc<dyn FileAdapter>>, Vec<Arc<dyn FileAdapter>>);

pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple {
pub fn get_all_adapters(
custom_identifiers: Option<CustomIdentifiers>,
custom_adapters: Option<Vec<CustomAdapterConfig>>,
) -> AdaptersTuple {
// order in descending priority
let mut adapters: Vec<Arc<dyn FileAdapter>> = vec![];
if let Some(custom_adapters) = custom_adapters {
Expand All @@ -118,12 +148,137 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
}
}

let custom_identifiers = custom_identifiers.unwrap_or_default();
let internal_adapters: Vec<Arc<dyn FileAdapter>> = vec![
Arc::new(PostprocPageBreaks::default()),
Arc::new(ffmpeg::FFmpegAdapter::new()),
Arc::new(zip::ZipAdapter::new()),
Arc::new(decompress::DecompressAdapter::new()),
Arc::new(mbox::MboxAdapter::new()),
Arc::new(ffmpeg::FFmpegAdapter {
extensions: custom_identifiers
.ffmpeg
.extensions
.clone()
.unwrap_or_else(|| {
ffmpeg::EXTENSIONS
.iter()
.map(|&s| s.to_string())
.collect::<Vec<String>>()
}),
mimetypes: custom_identifiers
.ffmpeg
.mimetypes
.clone()
.unwrap_or_else(|| {
ffmpeg::MIMETYPES
.iter()
.map(|&s| s.to_string())
.collect::<Vec<String>>()
}),
}),
Arc::new(zip::ZipAdapter {
extensions: custom_identifiers
.zip
.extensions
.clone()
.unwrap_or_else(|| {
zip::EXTENSIONS
.iter()
.map(|&s| s.to_string())
.collect::<Vec<String>>()
}),
mimetypes: custom_identifiers.zip.mimetypes.clone().unwrap_or_else(|| {
zip::MIMETYPES
.iter()
.map(|&s| s.to_string())
.collect::<Vec<String>>()
}),
}),
// Every compression format has its own pair of identifier lists. A list the
// user did not override falls back to the adapter's built-in defaults.
Arc::new(decompress::DecompressAdapter {
    extensions_gz: custom_identifiers.gz.extensions.clone().unwrap_or_else(|| {
        decompress::EXTENSIONS_GZ
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    extensions_bz2: custom_identifiers.bz2.extensions.clone().unwrap_or_else(|| {
        decompress::EXTENSIONS_BZ2
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    extensions_xz: custom_identifiers.xz.extensions.clone().unwrap_or_else(|| {
        decompress::EXTENSIONS_XZ
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    extensions_zst: custom_identifiers.zst.extensions.clone().unwrap_or_else(|| {
        decompress::EXTENSIONS_ZST
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    // The mimetype lists must be read from the `mimetypes` overrides; reading
    // `extensions` here would register file extensions as mime types and
    // ignore any mimetypes the user configured.
    mimetypes_gz: custom_identifiers.gz.mimetypes.clone().unwrap_or_else(|| {
        decompress::MIMETYPES_GZ
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    mimetypes_bz2: custom_identifiers.bz2.mimetypes.clone().unwrap_or_else(|| {
        decompress::MIMETYPES_BZ2
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    mimetypes_xz: custom_identifiers.xz.mimetypes.clone().unwrap_or_else(|| {
        decompress::MIMETYPES_XZ
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
    mimetypes_zst: custom_identifiers.zst.mimetypes.clone().unwrap_or_else(|| {
        decompress::MIMETYPES_ZST
            .iter()
            .map(|&s| s.to_string())
            .collect::<Vec<String>>()
    }),
}),
Arc::new(mbox::MboxAdapter {
extensions: custom_identifiers
.mbox
.extensions
.clone()
.unwrap_or_else(|| {
mbox::EXTENSIONS
.iter()
.map(|&s| s.to_string())
.collect::<Vec<String>>()
}),
mimetypes: custom_identifiers
.mbox
.mimetypes
.clone()
.unwrap_or_else(|| {
mbox::MIMETYPES
.iter()
.map(|&s| s.to_string())
.collect::<Vec<String>>()
}),
}),
Arc::new(tar::TarAdapter::new()),
Arc::new(sqlite::SqliteAdapter::new()),
];
Expand All @@ -148,10 +303,12 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
* - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority)
*/
pub fn get_adapters_filtered<T: AsRef<str>>(
custom_identifiers: Option<CustomIdentifiers>,
custom_adapters: Option<Vec<CustomAdapterConfig>>,
adapter_names: &[T],
) -> Result<Vec<Arc<dyn FileAdapter>>> {
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters);
let (def_enabled_adapters, def_disabled_adapters) =
get_all_adapters(custom_identifiers, custom_adapters);
let adapters = if !adapter_names.is_empty() {
let adapters_map: HashMap<_, _> = def_enabled_adapters
.iter()
Expand Down
Loading

0 comments on commit 24f57ef

Please sign in to comment.