//************************************************************************ //* * //* File Name: stl_string_regex.cpp * //* Author: Richard Kernan * //* Last Modified: 08/01/01 * //* * //************************************************************************ //* * //* This file contains the function definitions for adding regular * //* expressions to C++ Standard Template Library (STL) string objects * //* and C-style character pointer strings (which can be passed to * //* these methods for implicit conversion). * //* * //************************************************************************ using namespace std; #include "stl_string_regex.h" //************************************************************************ //* * //* METHOD: regx_match( std::string, std::string ) * //* PURPOSE: To provide regular expression searches to Standard * //* Template Library (STL) string objects. * //* ARGUMENTS: "stl_string" is the STL string object that will be * //* searched. * //* "pattern" is the regular expression pattern used in * //* the search. * //* RETURNS: true if there is a regular expression match, * //* false otherwise. * //* NOTES: If there are any errors with the regular expression * //* pattern, this method will throw one of the exceptions * //* listed in the exception specification. * //* Other possible exceptions are those that may be * //* thrown by the STL string or vector classes. * //* * //************************************************************************ bool regx_match (const string & stl_string, const string & pattern) { regex_t regexp; int status = 0; char *err_msg = NULL; // Compile the regular expression pattern if ((status = regcomp (®exp, pattern.c_str (), REG_EXTENDED | REG_NOSUB)) != 0) { if (status == REG_ESPACE) throw bad_alloc (); else { // Get the size of the message. int err_msg_sz = regerror (status, ®exp, NULL, (size_t) 0); // Allocate the memory, print the message, // and then free the memory. if ((err_msg = (char *) malloc (err_msg_sz)) != NULL) { regerror (status, ®exp, err_msg, err_msg_sz); string error = err_msg; free (err_msg); err_msg = NULL; error += "\nRegular expression = \""; error += pattern; error += "\""; throw invalid_argument (error.c_str ()); } else { string invalid = "Invalid regular expression: "; invalid += pattern; throw invalid_argument (invalid.c_str ()); } } } // Search for the regular expression in the string status = regexec (®exp, stl_string.c_str (), (size_t) 0, NULL, 0); // Free the memory allocated by // the regex.h functions. regfree (®exp); if (status == REG_NOMATCH) return false; else if (status != 0) { if (status == REG_ESPACE) throw bad_alloc (); else { // Get the size of the message. int err_msg_sz = regerror (status, ®exp, NULL, (size_t) 0); // Allocate the memory, print the message, // and then free the memory. if ((err_msg = (char *) malloc (err_msg_sz)) != NULL) { regerror (status, ®exp, err_msg, err_msg_sz); string error = err_msg; free (err_msg); err_msg = NULL; error += "\nRegular expression = \""; error += pattern; error += "\""; throw invalid_argument (error.c_str ()); } else { string invalid = "Invalid regular expression: "; invalid += pattern; throw invalid_argument (invalid.c_str ()); } } } return true; } //************************************************************************ //* * //* METHOD: regx_match( std::string, std::string, * //* std::vector<std::string> ) * //* PURPOSE: To provide regular expression searches to Standard * //* Template Library (STL) string objects. * //* ARGUMENTS: "stl_string" is the STL string object that will be * //* searched. * //* "pattern" is the regular expression pattern used in * //* the search. * //* "sub_strings" is an STL vector of STL string objects * //* that represents any substring matches within the * //* regular expression. See NOTES for details. * //* RETURNS: true if there is a regular expression match, * //* false otherwise. * //* NOTES: If there are any errors with the regular expression * //* pattern, this method will throw one of the exceptions * //* listed in the exception specification. * //* Other possible exceptions are those that may be * //* thrown by the STL string or vector classes. * //* * //* Details on "sub_strings": * //* sub_strings[0] = the substring within "stl_string" * //* that is matched by the entire regular expression. * //* !! changed sub_strings[0] matches the before string //* //* Any other strings in "sub_strings" represent * //* substrings within "stl_string" that are matched by * //* parts of the regular expression "pattern" that are * //* enclosed in parentheses. * //* For example (run on Linux): * //* stl_string = "This 234 is jklfjk my 0...9 string" * //* pattern = "([0-9]+) is(.*?)([0-9]+)" * //* sub_strings[0] = "234 is jklfjk my 0...9" * //* sub_strings[1] = "234" * //* sub_strings[2] = " jklfjk my 0..." * //* sub_strings[3] = "9" * //* * //* For those of you who are accustomed to Perl syntax, * //* sub_strings[1] = $1 * //* sub_strings[2] = $2 * //* etc. * //* * //* This will not compile on the IRIX.gl system at UMBC. * //* I believe the compiler is too old to recognize some * //* of the items in these functions. * //* * //************************************************************************ bool regx_match (const string & stl_string, const string & pattern, vector < string > &sub_strings) { regex_t regexp; char *err_msg = NULL; int status = 0; int nsubs = pattern.size () + 1; // Compile the regular expression and check for errors if ((status = regcomp (®exp, pattern.c_str (), REG_EXTENDED)) != 0) { if (status == REG_ESPACE) throw bad_alloc (); else { // Get the size of the message. int err_msg_sz = regerror (status, ®exp, NULL, (size_t) 0); // Allocate the memory, print the message, // and then free the memory. if ((err_msg = (char *) malloc (err_msg_sz)) != NULL) { regerror (status, ®exp, err_msg, err_msg_sz); string error = err_msg; free (err_msg); err_msg = NULL; error += "\nRegular expression = \""; error += pattern; error += "\""; throw invalid_argument (error.c_str ()); } else { string invalid = "Invalid regular expression: "; invalid += pattern; throw invalid_argument (invalid.c_str ()); } } } if (nsubs > 0) { // Allocate space to save the matching substring indices regmatch_t *subidx = new regmatch_t[nsubs * sizeof (regmatch_t)]; if (subidx == NULL) nsubs = 0; else { // Allocate space to save the characters in the // matching substrings. char *substring = new char[stl_string.size () + 1]; if (substring == NULL) nsubs = 0; else { // Perform the search and save the matching substring indices status = regexec (®exp, stl_string.c_str (), (size_t) nsubs, subidx, 0); sub_strings.clear (); // without this a failed match causes a hang Nandor if (status == REG_NOMATCH) return false; int i, j, k; // fill sub_string[0] with the string before the match for (j = 0, k = 0; j < subidx[0].rm_so; j++, k++) substring[k] = stl_string.at (j); substring[k] = '\0'; string str = substring; sub_strings.push_back (str); for (i = 1; i < nsubs; i++) { // Stop the substring saves when there are no more to save if (subidx[i].rm_so == -1 || subidx[i].rm_eo == -1) break; int j = 0; int k = 0; // Save the matching substrings to the allocated array for (j = subidx[i].rm_so, k = 0; j < subidx[i].rm_eo; j++, k++) substring[k] = stl_string.at (j); substring[k] = '\0'; // Save the matching substrings in the // vector of strings string str = substring; sub_strings.push_back (str); } delete[]substring; } delete[]subidx; } } // If the memory allocation failed, perform the search // without saving the substrings. if (nsubs <= 0) status = regexec (®exp, stl_string.c_str (), (size_t) 0, NULL, 0); // Free the memory allocated by // the regex.h functions. regfree (®exp); if (status == REG_NOMATCH) return false; if (status != 0) { if (status == REG_ESPACE) throw bad_alloc (); else { // Get the size of the message. int err_msg_sz = regerror (status, ®exp, NULL, (size_t) 0); // Allocate the memory, print the message, // and then free the memory. if ((err_msg = (char *) malloc (err_msg_sz)) != NULL) { regerror (status, ®exp, err_msg, err_msg_sz); string error = err_msg; free (err_msg); err_msg = NULL; error += "\nRegular expression = \""; error += pattern; error += "\""; throw invalid_argument (error.c_str ()); } else { string invalid = "Invalid regular expression: "; invalid += pattern; throw invalid_argument (invalid.c_str ()); } } } return true; } bool regx_replace (string & ss, const string & pattern, const string replacement) { vector < string > fields; if (regx_match (ss, "(" + pattern + ")(.*)", fields)) { // cout << fields[0] << "|" << fields[1] << "|" << fields[2] << "|\n"; ss = fields[0] + replacement + fields[2]; return fields[1] != replacement; } return false; } void regx_replaceall (string & ss, const string & pattern, const string replacement) { vector < string > fields; string s; while (regx_match (ss, "(" + pattern + ")(.*)", fields)) { s = s + fields[0] + replacement; ss = fields[2]; // cout << s << ss << "\n"; } ss = s + ss; } void regx_split (const string & ss, const string & pattern, vector < string > &fields) { vector < string > fie; string pat = "(" + pattern + ")(.+)"; string s = ss; regx_replace (s, "^" + pattern, ""); regx_replace (s, pattern + "$", ""); while (regx_match (s, pat, fie)) { fields.push_back (fie[0]); s = fie[2]; fie.clear (); } fields.push_back (s); }