8. Regular Expressions

Created Wednesday 23 April 2014

STRING =~ REGEX
STRING !~ REGEX

# With no binding operator the match is applied to $_:
# print every word that contains "cat".
foreach (@words) {
if (/cat/) {
print "$_\n";
}
}
# !~ is the negated binding operator:
# print every word that does NOT contain "cat".
foreach my $word (@words) {
if ($word !~ /cat/ ) {
print "$word\n";
}
}
# Without the binding operator, use negation like normal:
foreach (@words) {
if (!/cat/) {
print "$_\n";
}
}

Quantifiers

Escape Sequences

Examples:
# {n} quantifier: does the string contain a NNN-NNN-NNNN phone number?
my $has_phone = "Phone: 123-456-7890" =~ /\b\d{3}-\d{3}-\d{4}\b/;
print $has_phone ? "Yes" : "No";

# Compare the ?, * and + quantifiers against a handful of sample words.
my @strings = qw(abba abacus abbba babble Barbarella Yello);
my @regexes = (qr/ab?/, qr/ab*/, qr/ab+/);

# For each word, report every pattern that matches it (in pattern order).
for my $string (@strings) {
    for my $regex (grep { $string =~ $_ } @regexes) {
        print "'$regex' matches '$string'\n";
    }
}

Extracting Data

# The parentheses capture the matched phone number into $1.
if ("Phone: 123-456-7890" =~ /(\b\d{3}-\d{3}-\d{4}\b)/) {
# Copy $1 right away; a later successful match would overwrite it.
my $phone = $1;
print "The phone number is $phone\n";
}
use strict;
use warnings;
use diagnostics;
use Data::Dumper;
# Sample records: a "Name: ... Age: ..." line followed by an
# "Occupation: ..." line for each person.
my $text = <<'END';
Name: Alice Allison Age: 23
Occupation: Spy
Name: Bob Barkley Age: 45
Occupation: Fry Cook
Name: Carol Carson Age: 44
Occupation: Manager
Name: Prince Age: 53
Occupation: World Class Musician
END

# Build a name => age lookup, one input line at a time.
my %age_for;
foreach my $line (split /\n/, $text) {
    # "Name:" is followed directly by \s+; a literal space before \s+ would
    # demand at least two whitespace characters and never match this data.
    # The non-greedy (.*?) stops the name at the whitespace before "Age:".
    if ($line =~ /Name:\s+(.*?)\s+Age:\s+(\d+)/) {
        $age_for{$1} = $2;
    }
}
print Dumper(\%age_for);
# The same name/age pattern, precompiled with qr{} and spread over several
# lines. The trailing /x makes the regex engine ignore the whitespace and
# the "# ..." comments inside the pattern.
my $name_and_age = qr{
Name:
\s+ # 1 or more whitespace
(.*?) # The name in $1
\s+ # 1 or more whitespace
Age:
\s+ # 1 or more whitespace
(\d+) # The age in $2
}x;
Find double words:
# (\w+) captures a word; \1 requires the same text again after whitespace.
print "Four score score and seven years ago" =~ /\b(\w+)\s+\1\b/ ? "The word ($1) was doubled" : "No doubles";

Modifiers and Anchors

# Precompiled "Name: ... Age: ..." pattern with two modifiers:
#   /x - ignore whitespace and "# ..." comments inside the pattern (without
#        it the newlines and comment text would have to match literally);
#   /i - match case-insensitively.
# The pattern body is kept flush-left so the printed form matches the
# output shown in these notes.
my $name_and_age = qr{
Name:
\s+ # 1 or more whitespace
(.*?) # The name in $1
\s+ # 1 or more whitespace
Age:
\s+ # 1 or more whitespace
(\d+) # The age in $2
}xi;
# Printing a qr// object shows the pattern together with its modifiers.
print $name_and_age;
This will print (Perl 5.14 and later show the leading flags as (?^ix: instead of (?ix-sm:):
(?ix-sm:
Name:
\s+ # 1 or more whitespace
(.*?) # The name in $1
\s+ # 1 or more whitespace
Age:
\s+ # 1 or more whitespace
(\d+) # The age in $2
)

Print every non-number in string:
# With /g in scalar context, each pass of the while loop finds the next
# match, continuing from where the previous one ended.
my $string = '';
while ("a1b2c3dddd444eee66" =~ /(\D+)/g) {

# $1 is the current run of non-digit characters.
$string .= $1;
}
print $string;
Count every occurrence of a word ending in the letters "at":
my $silly = 'The fat cat sat on the mat';
my $at_words = 0;
# /g in scalar context finds one match per evaluation, so the counter is
# bumped once per word. The trailing \b makes sure "at" really ends the
# word; without it, words that merely contain "at" after their first
# letter would be counted as well.
$at_words++ while $silly =~ /\b\w+at\b/g;

# Variables are interpolated into a pattern before it is compiled, so
# /$match/ behaves like /aa/ here. Any regex metacharacters in $match
# would be treated as regex syntax, not literal text.
my $match = "aa";
if ($some_string =~ /$match/) {
# match words containing aa
}

Character Classes

Grouping

Example:
my %age_for;
# Scan the whole text at once: with /g in scalar context each loop pass
# picks up the next "Name: ... Age: ..." record.
# "Name:" is followed directly by \s+; a literal space before \s+ would
# require two whitespace characters after the colon.
while ($text =~ m<Name:\s+([[:alpha:] ]+?)\s+Age:\s+(\d+)>g) {
    $age_for{$1} = $2;
}
# The m<...> is just the form of delimiting a regex
# The [[:alpha:] ] matches alphabetic chars plus a space char

ADVANCED MATCHING

Substitutions

s/regular expression/replacement text/
Example:
my $main_course = "A well-done filet mignon";
# s/// edits $main_course in place, replacing the first match only.
$main_course =~ s/well-done/rare/;
print $main_course; # prints "A rare filet mignon"

Stupid technique to remove all doubled words from a text:
my $text = "a a b b c cat dddd";
# Replace "word word" with a single "word". /g repeats this across the string.
# The trailing \b keeps "c cat" from being treated as a doubled "c".
$text =~ s/\b(\w+)\s+\1\b/$1/g;
print $text; # prints "a b c cat dddd"

Lookahead/Lookbehind Anchors

Named Subexpressions (5.10)

use v5.10;
my $text = "a a b b c cat dddd";
# Same doubled-word removal, using a named capture:
#   (?<word>...) names the group, \g{word} refers back to it inside the
#   pattern, and $+{word} reads it in the replacement.
$text =~ s/\b(?<word>\w+)\s+\g{word}\b/$+{word}/g;
print $text;

COMMON REGEX ISSUES

Regexp::Common

Match a real number:
use Regexp::Common;
# $RE{num}{real} is the real-number pattern looked up in the %RE hash.
print "yes" if '-3e17' =~ $RE{num}{real};
Blank out profanity:
use Regexp::Common;
my $text = 'something awful or amusing';
# /e evaluates the replacement as Perl code: each match of $RE{profanity}
# becomes a run of '*' characters of the same length as the match.
$text =~ s/($RE{profanity})/'*' x length($1)/eg;
print $text;

E-mail Addresses

use Email::Valid;
# Prints 'yes' when Email::Valid->address returns a true value for
# $maybe_email, 'no' otherwise.
print (Email::Valid->address($maybe_email) ? 'yes' : 'no');

HTML

Composing Regexes

# Build the employee-ID pattern out of small, separately named pieces.
my $depts         = join '|', qw(AC IT MG JA);
my $dept_re       = qr/$depts/;       # one of the department codes
my $grade_re      = qr/[01]\d|20/;    # grade 00 through 20
my $emp_number_re = qr/\d{5,6}/;      # 5- or 6-digit employee number

# Matches against $_, e.g. "IT-07-12345". Each piece sits in its own
# capture group, so the third group must be closed before the final \b.
if (/\b($dept_re)-($grade_re)-($emp_number_re)\b/) {
    my $dept       = $1;
    my $grade      = $2;
    my $emp_number = $3;
    ...
}

SUMMARY



Backlinks: