@@ -133,7 +133,11 @@ function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keyword
/**
 * Stem every token with an instance of the given stemmer class.
 * Array keys are preserved.
 * @param array $tokens
 * @param string $stemmerClassName Class to instantiate; it must expose stem($token)
 * @return array The stemmed tokens
 */
function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array
{
    $stemmer = new $stemmerClassName();
    $stemmed = [];
    foreach ($tokens as $index => $word) {
        $stemmed[$index] = $stemmer->stem($word);
    }
    return $stemmed;
}
138142}
139143
@@ -224,28 +228,32 @@ function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
224228}
225229
/**
 * Run every token through the named filter, modifying $tokens in place.
 * Nothing is returned; each element is replaced by the value the filter's
 * transform() produces for it, and array keys are left untouched.
 * @param array $tokens Passed by reference
 * @param string $filterType Short class name inside \TextAnalysis\Filters
 */
function filter_tokens(array &$tokens, string $filterType)
{
    $className = "\\TextAnalysis\\Filters\\{$filterType}";
    $filter = new $className();
    array_walk($tokens, function (&$value) use ($filter) {
        $value = $filter->transform($value);
    });
}
238244
/**
 * Filter out stop words, modifying $tokens in place.
 * Each token is replaced by the value StopWordsFilter::transform() returns
 * for it (presumably null for a stop word -- confirm against StopWordsFilter).
 * Array keys are left untouched and nothing is returned.
 * @param array $tokens Passed by reference
 * @param array $stopwords Only read here, so it is taken by value; this lets
 *                         callers pass a literal array or a function result
 */
function filter_stopwords(array &$tokens, array $stopwords)
{
    $filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);
    foreach ($tokens as &$token) {
        $token = $filter->transform($token);
    }
    // Break the reference so $token no longer aliases the last element.
    unset($token);
}
250258
251259/**
@@ -255,9 +263,89 @@ function filter_stopwords(array $tokens, array $stopwords) : array
255263 */
function get_stop_words(string $filePath) : array
{
    // file() yields one element per line, line endings included;
    // trim() strips those along with any surrounding whitespace.
    $lines = file($filePath);
    return array_map('trim', $lines);
}
260270
/**
 * Compute polarity scores for the tokens with the Vader sentiment class.
 * @param array $tokens
 * @return array Whatever Vader::getPolarityScores() returns for the tokens
 */
function vader(array $tokens) : array
{
    $analyzer = new \TextAnalysis\Sentiment\Vader();
    return $analyzer->getPolarityScores($tokens);
}
280+
/**
 * Filter out all null and empty strings.
 * A token is dropped when it is null or when it contains nothing but
 * whitespace. The token "0" is kept: it is neither null nor empty, even
 * though PHP treats it as falsy. Array keys are preserved (no re-indexing).
 * @param array $tokens
 * @return string[]
 */
function filter_empty(array $tokens) : array
{
    return array_filter($tokens, function ($token) {
        // Compare explicitly instead of relying on empty()/truthiness,
        // which would also discard "0".
        return $token !== null && trim((string) $token) !== '';
    });
}
296+
/**
 * Comparison callback that orders objects by getScore(), highest first.
 * @param object $a Must expose getScore()
 * @param object $b Must expose getScore()
 * @return int -1 when $a scores higher, 1 when $b scores higher, 0 when equal
 */
function score_keeper_sort($a, $b)
{
    // Operands are swapped so that the larger score sorts first.
    return $b->getScore() <=> $a->getScore();
}
304+
/**
 * Apply common filters and summarize the text: the sentences and the word
 * tokens of $text are cleaned up, handed to the Simple summarizer, and the
 * sentences are returned in the order of the scores it produces.
 *
 * The sentences are tokenized from the lower-cased text, so the returned
 * sentences are lower-cased too.
 *
 * @param string $text
 * @param array $stopwords Optional stop words, removed from the word tokens
 *                         and from the copy of the sentences used for scoring
 * @return array Sentences of the lower-cased text, in summarizer score order
 */
function summary_simple(string $text, array $stopwords = []) : array
{
    $sentenceTokensOriginal = (new \TextAnalysis\Tokenizers\VanderleeTokenizer())->tokenize(strtolower($text));

    // Work on a copy so the sentences handed back to the caller stay unfiltered.
    $sentenceTokens = $sentenceTokensOriginal;
    if (!empty($stopwords)) {
        // Remove stop words as whole words only. A plain str_replace() would
        // also cut them out of longer words (e.g. "a" out of "cat").
        $quoted = [];
        foreach ($stopwords as $stopword) {
            $stopword = (string) $stopword;
            // An empty alternative would match at every position, so skip it.
            if ($stopword !== '') {
                $quoted[] = preg_quote($stopword, '/');
            }
        }

        // Build the patterns in chunks to keep each compiled regex small,
        // even for very long stop word lists.
        $patterns = [];
        foreach (array_chunk($quoted, 100) as $chunk) {
            $patterns[] = '/(?<![\p{L}\p{N}_])(?:' . implode('|', $chunk) . ')(?![\p{L}\p{N}_])/u';
        }

        if (!empty($patterns)) {
            foreach ($sentenceTokens as $index => $sentence) {
                $cleaned = preg_replace($patterns, " ", $sentence);
                // preg_replace() returns null on failure (e.g. invalid UTF-8);
                // in that case the sentence is left as it was.
                if ($cleaned !== null) {
                    $sentenceTokens[$index] = $cleaned;
                }
            }
        }
    }

    filter_tokens($sentenceTokens, 'TrimFilter');
    filter_tokens($sentenceTokens, 'QuotesFilter');
    filter_tokens($sentenceTokens, 'CharFilter');

    $wordTokens = tokenize($text);
    foreach (['LowerCaseFilter', 'PunctuationFilter', 'QuotesFilter', 'PossessiveNounFilter', 'CharFilter'] as $filterType) {
        filter_tokens($wordTokens, $filterType);
    }

    if (!empty($stopwords)) {
        filter_stopwords($wordTokens, $stopwords);
    }

    $summarizer = new \TextAnalysis\Analysis\Summarize\Simple();
    $scores = $summarizer->summarize(filter_empty($wordTokens), $sentenceTokens);

    // Map each score back to its unfiltered sentence, keeping the order
    // in which the summarizer returned the scores.
    $bestSentences = [];
    foreach ($scores as $score) {
        $bestSentences[] = $sentenceTokensOriginal[$score->getIndex()];
    }
    return $bestSentences;
}
261349
262350
263351
0 commit comments