use std::cell::RefCell;
use std::collections::HashMap;
use std::panic::AssertUnwindSafe;
use std::sync::Arc;
#[cfg(feature = "perf-literal")]
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use syntax::hir::literal::Literals;
use syntax::hir::Hir;
use syntax::ParserBuilder;
use backtrack;
use compile::Compiler;
#[cfg(feature = "perf-dfa")]
use dfa;
use error::Error;
use input::{ByteInput, CharInput};
use literal::LiteralSearcher;
use pikevm;
use pool::{Pool, PoolGuard};
use prog::Program;
use re_builder::RegexOptions;
use re_bytes;
use re_set;
use re_trait::{Locations, RegularExpression, Slot};
use re_unicode;
use utf8::next_utf8;
#[derive(Debug)]
pub struct Exec {
ro: Arc<ExecReadOnly>,
pool: Pool<ProgramCache>,
}
#[derive(Debug)]
pub struct ExecNoSync<'c> {
ro: &'c Arc<ExecReadOnly>,
cache: PoolGuard<'c, ProgramCache>,
}
#[derive(Debug)]
pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
#[derive(Debug)]
struct ExecReadOnly {
res: Vec<String>,
nfa: Program,
dfa: Program,
dfa_reverse: Program,
suffixes: LiteralSearcher,
#[cfg(feature = "perf-literal")]
ac: Option<AhoCorasick<u32>>,
match_type: MatchType,
}
#[allow(missing_debug_implementations)]
pub struct ExecBuilder {
options: RegexOptions,
match_type: Option<MatchType>,
bytes: bool,
only_utf8: bool,
}
struct Parsed {
exprs: Vec<Hir>,
prefixes: Literals,
suffixes: Literals,
bytes: bool,
}
impl ExecBuilder {
pub fn new(re: &str) -> Self {
Self::new_many(&[re])
}
pub fn new_many<I, S>(res: I) -> Self
where
S: AsRef<str>,
I: IntoIterator<Item = S>,
{
let mut opts = RegexOptions::default();
opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
Self::new_options(opts)
}
pub fn new_options(opts: RegexOptions) -> Self {
ExecBuilder {
options: opts,
match_type: None,
bytes: false,
only_utf8: true,
}
}
pub fn automatic(mut self) -> Self {
self.match_type = None;
self
}
pub fn nfa(mut self) -> Self {
self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
self
}
pub fn bounded_backtracking(mut self) -> Self {
self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
self
}
pub fn bytes(mut self, yes: bool) -> Self {
self.bytes = yes;
self
}
pub fn only_utf8(mut self, yes: bool) -> Self {
self.only_utf8 = yes;
self
}
pub fn unicode(mut self, yes: bool) -> Self {
self.options.unicode = yes;
self
}
fn parse(&self) -> Result<Parsed, Error> {
let mut exprs = Vec::with_capacity(self.options.pats.len());
let mut prefixes = Some(Literals::empty());
let mut suffixes = Some(Literals::empty());
let mut bytes = false;
let is_set = self.options.pats.len() > 1;
for pat in &self.options.pats {
let mut parser = ParserBuilder::new()
.octal(self.options.octal)
.case_insensitive(self.options.case_insensitive)
.multi_line(self.options.multi_line)
.dot_matches_new_line(self.options.dot_matches_new_line)
.swap_greed(self.options.swap_greed)
.ignore_whitespace(self.options.ignore_whitespace)
.unicode(self.options.unicode)
.allow_invalid_utf8(!self.only_utf8)
.nest_limit(self.options.nest_limit)
.build();
let expr =
parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
bytes = bytes || !expr.is_always_utf8();
if cfg!(feature = "perf-literal") {
if !expr.is_anchored_start() && expr.is_any_anchored_start() {
prefixes = None;
} else if is_set && expr.is_anchored_start() {
prefixes = None;
}
prefixes = prefixes.and_then(|mut prefixes| {
if !prefixes.union_prefixes(&expr) {
None
} else {
Some(prefixes)
}
});
if !expr.is_anchored_end() && expr.is_any_anchored_end() {
suffixes = None;
} else if is_set && expr.is_anchored_end() {
suffixes = None;
}
suffixes = suffixes.and_then(|mut suffixes| {
if !suffixes.union_suffixes(&expr) {
None
} else {
Some(suffixes)
}
});
}
exprs.push(expr);
}
Ok(Parsed {
exprs: exprs,
prefixes: prefixes.unwrap_or_else(Literals::empty),
suffixes: suffixes.unwrap_or_else(Literals::empty),
bytes: bytes,
})
}
pub fn build(self) -> Result<Exec, Error> {
if self.options.pats.is_empty() {
let ro = Arc::new(ExecReadOnly {
res: vec![],
nfa: Program::new(),
dfa: Program::new(),
dfa_reverse: Program::new(),
suffixes: LiteralSearcher::empty(),
#[cfg(feature = "perf-literal")]
ac: None,
match_type: MatchType::Nothing,
});
let pool = ExecReadOnly::new_pool(&ro);
return Ok(Exec { ro: ro, pool });
}
let parsed = self.parse()?;
let mut nfa = Compiler::new()
.size_limit(self.options.size_limit)
.bytes(self.bytes || parsed.bytes)
.only_utf8(self.only_utf8)
.compile(&parsed.exprs)?;
let mut dfa = Compiler::new()
.size_limit(self.options.size_limit)
.dfa(true)
.only_utf8(self.only_utf8)
.compile(&parsed.exprs)?;
let mut dfa_reverse = Compiler::new()
.size_limit(self.options.size_limit)
.dfa(true)
.only_utf8(self.only_utf8)
.reverse(true)
.compile(&parsed.exprs)?;
#[cfg(feature = "perf-literal")]
let ac = self.build_aho_corasick(&parsed);
nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
dfa.prefixes = nfa.prefixes.clone();
dfa.dfa_size_limit = self.options.dfa_size_limit;
dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
let mut ro = ExecReadOnly {
res: self.options.pats,
nfa: nfa,
dfa: dfa,
dfa_reverse: dfa_reverse,
suffixes: LiteralSearcher::suffixes(parsed.suffixes),
#[cfg(feature = "perf-literal")]
ac: ac,
match_type: MatchType::Nothing,
};
ro.match_type = ro.choose_match_type(self.match_type);
let ro = Arc::new(ro);
let pool = ExecReadOnly::new_pool(&ro);
Ok(Exec { ro, pool })
}
#[cfg(feature = "perf-literal")]
fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
if parsed.exprs.len() != 1 {
return None;
}
let lits = match alternation_literals(&parsed.exprs[0]) {
None => return None,
Some(lits) => lits,
};
if lits.len() <= 32 {
return None;
}
Some(
AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostFirst)
.auto_configure(&lits)
.byte_classes(true)
.build_with_size::<u32, _, _>(&lits)
.expect("AC automaton too big"),
)
}
}
impl<'c> RegularExpression for ExecNoSyncStr<'c> {
type Text = str;
fn slots_len(&self) -> usize {
self.0.slots_len()
}
fn next_after_empty(&self, text: &str, i: usize) -> usize {
next_utf8(text.as_bytes(), i)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
self.0.shortest_match_at(text.as_bytes(), start)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_match_at(&self, text: &str, start: usize) -> bool {
self.0.is_match_at(text.as_bytes(), start)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
self.0.find_at(text.as_bytes(), start)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn captures_read_at(
&self,
locs: &mut Locations,
text: &str,
start: usize,
) -> Option<(usize, usize)> {
self.0.captures_read_at(locs, text.as_bytes(), start)
}
}
impl<'c> RegularExpression for ExecNoSync<'c> {
type Text = [u8];
fn slots_len(&self) -> usize {
self.ro.nfa.captures.len() * 2
}
fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
i + 1
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
if !self.is_anchor_end_match(text) {
return None;
}
match self.ro.match_type {
#[cfg(feature = "perf-literal")]
MatchType::Literal(ty) => {
self.find_literals(ty, text, start).map(|(_, e)| e)
}
#[cfg(feature = "perf-dfa")]
MatchType::Dfa | MatchType::DfaMany => {
match self.shortest_dfa(text, start) {
dfa::Result::Match(end) => Some(end),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.shortest_nfa(text, start),
}
}
#[cfg(feature = "perf-dfa")]
MatchType::DfaAnchoredReverse => {
match dfa::Fsm::reverse(
&self.ro.dfa_reverse,
self.cache.value(),
true,
&text[start..],
text.len(),
) {
dfa::Result::Match(_) => Some(text.len()),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.shortest_nfa(text, start),
}
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
MatchType::DfaSuffix => {
match self.shortest_dfa_reverse_suffix(text, start) {
dfa::Result::Match(e) => Some(e),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.shortest_nfa(text, start),
}
}
MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
MatchType::Nothing => None,
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_match_at(&self, text: &[u8], start: usize) -> bool {
if !self.is_anchor_end_match(text) {
return false;
}
match self.ro.match_type {
#[cfg(feature = "perf-literal")]
MatchType::Literal(ty) => {
self.find_literals(ty, text, start).is_some()
}
#[cfg(feature = "perf-dfa")]
MatchType::Dfa | MatchType::DfaMany => {
match self.shortest_dfa(text, start) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch(_) => false,
dfa::Result::Quit => self.match_nfa(text, start),
}
}
#[cfg(feature = "perf-dfa")]
MatchType::DfaAnchoredReverse => {
match dfa::Fsm::reverse(
&self.ro.dfa_reverse,
self.cache.value(),
true,
&text[start..],
text.len(),
) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch(_) => false,
dfa::Result::Quit => self.match_nfa(text, start),
}
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
MatchType::DfaSuffix => {
match self.shortest_dfa_reverse_suffix(text, start) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch(_) => false,
dfa::Result::Quit => self.match_nfa(text, start),
}
}
MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
MatchType::Nothing => false,
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
if !self.is_anchor_end_match(text) {
return None;
}
match self.ro.match_type {
#[cfg(feature = "perf-literal")]
MatchType::Literal(ty) => self.find_literals(ty, text, start),
#[cfg(feature = "perf-dfa")]
MatchType::Dfa => match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => Some((s, e)),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => {
self.find_nfa(MatchNfaType::Auto, text, start)
}
},
#[cfg(feature = "perf-dfa")]
MatchType::DfaAnchoredReverse => {
match self.find_dfa_anchored_reverse(text, start) {
dfa::Result::Match((s, e)) => Some((s, e)),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => {
self.find_nfa(MatchNfaType::Auto, text, start)
}
}
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
MatchType::DfaSuffix => {
match self.find_dfa_reverse_suffix(text, start) {
dfa::Result::Match((s, e)) => Some((s, e)),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => {
self.find_nfa(MatchNfaType::Auto, text, start)
}
}
}
MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
MatchType::Nothing => None,
#[cfg(feature = "perf-dfa")]
MatchType::DfaMany => {
unreachable!("BUG: RegexSet cannot be used with find")
}
}
}
fn captures_read_at(
&self,
locs: &mut Locations,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
let slots = locs.as_slots();
for slot in slots.iter_mut() {
*slot = None;
}
match slots.len() {
0 => return self.find_at(text, start),
2 => {
return self.find_at(text, start).map(|(s, e)| {
slots[0] = Some(s);
slots[1] = Some(e);
(s, e)
});
}
_ => {}
}
if !self.is_anchor_end_match(text) {
return None;
}
match self.ro.match_type {
#[cfg(feature = "perf-literal")]
MatchType::Literal(ty) => {
self.find_literals(ty, text, start).and_then(|(s, e)| {
self.captures_nfa_type(
MatchNfaType::Auto,
slots,
text,
s,
e,
)
})
}
#[cfg(feature = "perf-dfa")]
MatchType::Dfa => {
if self.ro.nfa.is_anchored_start {
self.captures_nfa(slots, text, start)
} else {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => self.captures_nfa_type(
MatchNfaType::Auto,
slots,
text,
s,
e,
),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => {
self.captures_nfa(slots, text, start)
}
}
}
}
#[cfg(feature = "perf-dfa")]
MatchType::DfaAnchoredReverse => {
match self.find_dfa_anchored_reverse(text, start) {
dfa::Result::Match((s, e)) => self.captures_nfa_type(
MatchNfaType::Auto,
slots,
text,
s,
e,
),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
MatchType::DfaSuffix => {
match self.find_dfa_reverse_suffix(text, start) {
dfa::Result::Match((s, e)) => self.captures_nfa_type(
MatchNfaType::Auto,
slots,
text,
s,
e,
),
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
}
MatchType::Nfa(ty) => {
self.captures_nfa_type(ty, slots, text, start, text.len())
}
MatchType::Nothing => None,
#[cfg(feature = "perf-dfa")]
MatchType::DfaMany => {
unreachable!("BUG: RegexSet cannot be used with captures")
}
}
}
}
impl<'c> ExecNoSync<'c> {
#[cfg(feature = "perf-literal")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_literals(
&self,
ty: MatchLiteralType,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
use self::MatchLiteralType::*;
match ty {
Unanchored => {
let lits = &self.ro.nfa.prefixes;
lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
}
AnchoredStart => {
let lits = &self.ro.nfa.prefixes;
if start == 0 || !self.ro.nfa.is_anchored_start {
lits.find_start(&text[start..])
.map(|(s, e)| (start + s, start + e))
} else {
None
}
}
AnchoredEnd => {
let lits = &self.ro.suffixes;
lits.find_end(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AhoCorasick => self
.ro
.ac
.as_ref()
.unwrap()
.find(&text[start..])
.map(|m| (start + m.start(), start + m.end())),
}
}
#[cfg(feature = "perf-dfa")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_dfa_forward(
&self,
text: &[u8],
start: usize,
) -> dfa::Result<(usize, usize)> {
use dfa::Result::*;
let end = match dfa::Fsm::forward(
&self.ro.dfa,
self.cache.value(),
false,
text,
start,
) {
NoMatch(i) => return NoMatch(i),
Quit => return Quit,
Match(end) if start == end => return Match((start, start)),
Match(end) => end,
};
match dfa::Fsm::reverse(
&self.ro.dfa_reverse,
self.cache.value(),
false,
&text[start..],
end - start,
) {
Match(s) => Match((start + s, end)),
NoMatch(i) => NoMatch(i),
Quit => Quit,
}
}
#[cfg(feature = "perf-dfa")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_dfa_anchored_reverse(
&self,
text: &[u8],
start: usize,
) -> dfa::Result<(usize, usize)> {
use dfa::Result::*;
match dfa::Fsm::reverse(
&self.ro.dfa_reverse,
self.cache.value(),
false,
&text[start..],
text.len() - start,
) {
Match(s) => Match((start + s, text.len())),
NoMatch(i) => NoMatch(i),
Quit => Quit,
}
}
#[cfg(feature = "perf-dfa")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn shortest_dfa_reverse_suffix(
&self,
text: &[u8],
start: usize,
) -> dfa::Result<usize> {
match self.exec_dfa_reverse_suffix(text, start) {
None => self.shortest_dfa(text, start),
Some(r) => r.map(|(_, end)| end),
}
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn exec_dfa_reverse_suffix(
&self,
text: &[u8],
original_start: usize,
) -> Option<dfa::Result<(usize, usize)>> {
use dfa::Result::*;
let lcs = self.ro.suffixes.lcs();
debug_assert!(lcs.len() >= 1);
let mut start = original_start;
let mut end = start;
let mut last_literal = start;
while end <= text.len() {
last_literal += match lcs.find(&text[last_literal..]) {
None => return Some(NoMatch(text.len())),
Some(i) => i,
};
end = last_literal + lcs.len();
match dfa::Fsm::reverse(
&self.ro.dfa_reverse,
self.cache.value(),
false,
&text[start..end],
end - start,
) {
Match(0) | NoMatch(0) => return None,
Match(i) => return Some(Match((start + i, end))),
NoMatch(i) => {
start += i;
last_literal += 1;
continue;
}
Quit => return Some(Quit),
};
}
Some(NoMatch(text.len()))
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_dfa_reverse_suffix(
&self,
text: &[u8],
start: usize,
) -> dfa::Result<(usize, usize)> {
use dfa::Result::*;
let match_start = match self.exec_dfa_reverse_suffix(text, start) {
None => return self.find_dfa_forward(text, start),
Some(Match((start, _))) => start,
Some(r) => return r,
};
match dfa::Fsm::forward(
&self.ro.dfa,
self.cache.value(),
false,
text,
match_start,
) {
NoMatch(_) => panic!("BUG: reverse match implies forward match"),
Quit => Quit,
Match(e) => Match((match_start, e)),
}
}
#[cfg(feature = "perf-dfa")]
fn match_nfa(&self, text: &[u8], start: usize) -> bool {
self.match_nfa_type(MatchNfaType::Auto, text, start)
}
fn match_nfa_type(
&self,
ty: MatchNfaType,
text: &[u8],
start: usize,
) -> bool {
self.exec_nfa(
ty,
&mut [false],
&mut [],
true,
false,
text,
start,
text.len(),
)
}
#[cfg(feature = "perf-dfa")]
fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
self.shortest_nfa_type(MatchNfaType::Auto, text, start)
}
fn shortest_nfa_type(
&self,
ty: MatchNfaType,
text: &[u8],
start: usize,
) -> Option<usize> {
let mut slots = [None, None];
if self.exec_nfa(
ty,
&mut [false],
&mut slots,
true,
true,
text,
start,
text.len(),
) {
slots[1]
} else {
None
}
}
fn find_nfa(
&self,
ty: MatchNfaType,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
let mut slots = [None, None];
if self.exec_nfa(
ty,
&mut [false],
&mut slots,
false,
false,
text,
start,
text.len(),
) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
}
} else {
None
}
}
#[cfg(feature = "perf-dfa")]
fn captures_nfa(
&self,
slots: &mut [Slot],
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
self.captures_nfa_type(
MatchNfaType::Auto,
slots,
text,
start,
text.len(),
)
}
fn captures_nfa_type(
&self,
ty: MatchNfaType,
slots: &mut [Slot],
text: &[u8],
start: usize,
end: usize,
) -> Option<(usize, usize)> {
if self.exec_nfa(
ty,
&mut [false],
slots,
false,
false,
text,
start,
end,
) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
}
} else {
None
}
}
fn exec_nfa(
&self,
mut ty: MatchNfaType,
matches: &mut [bool],
slots: &mut [Slot],
quit_after_match: bool,
quit_after_match_with_pos: bool,
text: &[u8],
start: usize,
end: usize,
) -> bool {
use self::MatchNfaType::*;
if let Auto = ty {
if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
ty = Backtrack;
} else {
ty = PikeVM;
}
}
if quit_after_match_with_pos || ty == PikeVM {
self.exec_pikevm(
matches,
slots,
quit_after_match,
text,
start,
end,
)
} else {
self.exec_backtrack(matches, slots, text, start, end)
}
}
fn exec_pikevm(
&self,
matches: &mut [bool],
slots: &mut [Slot],
quit_after_match: bool,
text: &[u8],
start: usize,
end: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
pikevm::Fsm::exec(
&self.ro.nfa,
self.cache.value(),
matches,
slots,
quit_after_match,
ByteInput::new(text, self.ro.nfa.only_utf8),
start,
end,
)
} else {
pikevm::Fsm::exec(
&self.ro.nfa,
self.cache.value(),
matches,
slots,
quit_after_match,
CharInput::new(text),
start,
end,
)
}
}
fn exec_backtrack(
&self,
matches: &mut [bool],
slots: &mut [Slot],
text: &[u8],
start: usize,
end: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
backtrack::Bounded::exec(
&self.ro.nfa,
self.cache.value(),
matches,
slots,
ByteInput::new(text, self.ro.nfa.only_utf8),
start,
end,
)
} else {
backtrack::Bounded::exec(
&self.ro.nfa,
self.cache.value(),
matches,
slots,
CharInput::new(text),
start,
end,
)
}
}
pub fn many_matches_at(
&self,
matches: &mut [bool],
text: &[u8],
start: usize,
) -> bool {
use self::MatchType::*;
if !self.is_anchor_end_match(text) {
return false;
}
match self.ro.match_type {
#[cfg(feature = "perf-literal")]
Literal(ty) => {
debug_assert_eq!(matches.len(), 1);
matches[0] = self.find_literals(ty, text, start).is_some();
matches[0]
}
#[cfg(feature = "perf-dfa")]
Dfa | DfaAnchoredReverse | DfaMany => {
match dfa::Fsm::forward_many(
&self.ro.dfa,
self.cache.value(),
matches,
text,
start,
) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch(_) => false,
dfa::Result::Quit => self.exec_nfa(
MatchNfaType::Auto,
matches,
&mut [],
false,
false,
text,
start,
text.len(),
),
}
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
DfaSuffix => {
match dfa::Fsm::forward_many(
&self.ro.dfa,
self.cache.value(),
matches,
text,
start,
) {
dfa::Result::Match(_) => true,
dfa::Result::NoMatch(_) => false,
dfa::Result::Quit => self.exec_nfa(
MatchNfaType::Auto,
matches,
&mut [],
false,
false,
text,
start,
text.len(),
),
}
}
Nfa(ty) => self.exec_nfa(
ty,
matches,
&mut [],
false,
false,
text,
start,
text.len(),
),
Nothing => false,
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_anchor_end_match(&self, text: &[u8]) -> bool {
#[cfg(not(feature = "perf-literal"))]
fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
true
}
#[cfg(feature = "perf-literal")]
fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
let lcs = ro.suffixes.lcs();
if lcs.len() >= 1 && !lcs.is_suffix(text) {
return false;
}
}
true
}
imp(&self.ro, text)
}
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}
}
impl<'c> ExecNoSyncStr<'c> {
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
self.0.capture_name_idx()
}
}
impl Exec {
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn searcher(&self) -> ExecNoSync {
ExecNoSync {
ro: &self.ro,
cache: self.pool.get(),
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn searcher_str(&self) -> ExecNoSyncStr {
ExecNoSyncStr(self.searcher())
}
pub fn into_regex(self) -> re_unicode::Regex {
re_unicode::Regex::from(self)
}
pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
re_set::unicode::RegexSet::from(self)
}
pub fn into_byte_regex(self) -> re_bytes::Regex {
re_bytes::Regex::from(self)
}
pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
re_set::bytes::RegexSet::from(self)
}
pub fn regex_strings(&self) -> &[String] {
&self.ro.res
}
pub fn capture_names(&self) -> &[Option<String>] {
&self.ro.nfa.captures
}
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}
}
impl Clone for Exec {
fn clone(&self) -> Exec {
let pool = ExecReadOnly::new_pool(&self.ro);
Exec { ro: self.ro.clone(), pool }
}
}
impl ExecReadOnly {
fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
if let Some(MatchType::Nfa(_)) = hint {
return hint.unwrap();
}
if self.nfa.insts.is_empty() {
return MatchType::Nothing;
}
if let Some(literalty) = self.choose_literal_match_type() {
return literalty;
}
if let Some(dfaty) = self.choose_dfa_match_type() {
return dfaty;
}
MatchType::Nfa(MatchNfaType::Auto)
}
fn choose_literal_match_type(&self) -> Option<MatchType> {
#[cfg(not(feature = "perf-literal"))]
fn imp(_: &ExecReadOnly) -> Option<MatchType> {
None
}
#[cfg(feature = "perf-literal")]
fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
if ro.res.len() != 1 {
return None;
}
if ro.ac.is_some() {
return Some(MatchType::Literal(
MatchLiteralType::AhoCorasick,
));
}
if ro.nfa.prefixes.complete() {
return if ro.nfa.is_anchored_start {
Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
} else {
Some(MatchType::Literal(MatchLiteralType::Unanchored))
};
}
if ro.suffixes.complete() {
return if ro.nfa.is_anchored_end {
Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
} else {
Some(MatchType::Literal(MatchLiteralType::Unanchored))
};
}
None
}
imp(self)
}
fn choose_dfa_match_type(&self) -> Option<MatchType> {
#[cfg(not(feature = "perf-dfa"))]
fn imp(_: &ExecReadOnly) -> Option<MatchType> {
None
}
#[cfg(feature = "perf-dfa")]
fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
if !dfa::can_exec(&ro.dfa) {
return None;
}
if ro.res.len() >= 2 {
return Some(MatchType::DfaMany);
}
if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
return Some(MatchType::DfaAnchoredReverse);
}
#[cfg(feature = "perf-literal")]
{
if ro.should_suffix_scan() {
return Some(MatchType::DfaSuffix);
}
}
Some(MatchType::Dfa)
}
imp(self)
}
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
fn should_suffix_scan(&self) -> bool {
if self.suffixes.is_empty() {
return false;
}
let lcs_len = self.suffixes.lcs().char_len();
lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
}
fn new_pool(ro: &Arc<ExecReadOnly>) -> Pool<ProgramCache> {
let ro = ro.clone();
Pool::new(Box::new(move || {
AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
}))
}
}
#[derive(Clone, Copy, Debug)]
enum MatchType {
#[cfg(feature = "perf-literal")]
Literal(MatchLiteralType),
#[cfg(feature = "perf-dfa")]
Dfa,
#[cfg(feature = "perf-dfa")]
DfaAnchoredReverse,
#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
DfaSuffix,
#[cfg(feature = "perf-dfa")]
DfaMany,
Nfa(MatchNfaType),
Nothing,
}
#[derive(Clone, Copy, Debug)]
#[cfg(feature = "perf-literal")]
enum MatchLiteralType {
Unanchored,
AnchoredStart,
AnchoredEnd,
AhoCorasick,
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum MatchNfaType {
Auto,
Backtrack,
PikeVM,
}
pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
#[derive(Debug)]
pub struct ProgramCacheInner {
pub pikevm: pikevm::Cache,
pub backtrack: backtrack::Cache,
#[cfg(feature = "perf-dfa")]
pub dfa: dfa::Cache,
#[cfg(feature = "perf-dfa")]
pub dfa_reverse: dfa::Cache,
}
impl ProgramCacheInner {
fn new(ro: &ExecReadOnly) -> Self {
ProgramCacheInner {
pikevm: pikevm::Cache::new(&ro.nfa),
backtrack: backtrack::Cache::new(&ro.nfa),
#[cfg(feature = "perf-dfa")]
dfa: dfa::Cache::new(&ro.dfa),
#[cfg(feature = "perf-dfa")]
dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
}
}
}
#[cfg(feature = "perf-literal")]
fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
use syntax::hir::{HirKind, Literal};
if !expr.is_alternation_literal() {
return None;
}
let alts = match *expr.kind() {
HirKind::Alternation(ref alts) => alts,
_ => return None,
};
let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
Literal::Unicode(c) => {
let mut buf = [0; 4];
dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
}
Literal::Byte(b) => {
dst.push(b);
}
};
let mut lits = vec![];
for alt in alts {
let mut lit = vec![];
match *alt.kind() {
HirKind::Literal(ref x) => extendlit(x, &mut lit),
HirKind::Concat(ref exprs) => {
for e in exprs {
match *e.kind() {
HirKind::Literal(ref x) => extendlit(x, &mut lit),
_ => unreachable!("expected literal, got {:?}", e),
}
}
}
_ => unreachable!("expected literal or concat, got {:?}", alt),
}
lits.push(lit);
}
Some(lits)
}
#[cfg(test)]
mod test {
#[test]
fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
use internal::ExecBuilder;
let backtrack_bytes_re = ExecBuilder::new("^S")
.bounded_backtracking()
.only_utf8(false)
.build()
.map(|exec| exec.into_byte_regex())
.map_err(|err| format!("{}", err))
.unwrap();
let default_bytes_re = ExecBuilder::new("^S")
.only_utf8(false)
.build()
.map(|exec| exec.into_byte_regex())
.map_err(|err| format!("{}", err))
.unwrap();
let input = vec![83, 83];
let s1 = backtrack_bytes_re.split(&input);
let s2 = default_bytes_re.split(&input);
for (chunk1, chunk2) in s1.zip(s2) {
assert_eq!(chunk1, chunk2);
}
}
#[test]
fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
use internal::ExecBuilder;
let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
.bounded_backtracking()
.bytes(true)
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))
.unwrap();
let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
.bytes(true)
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))
.unwrap();
let input = "**";
let s1 = backtrack_bytes_re.split(input);
let s2 = default_bytes_re.split(input);
for (chunk1, chunk2) in s1.zip(s2) {
assert_eq!(chunk1, chunk2);
}
}
}