N-граммы в действии
This commit is contained in:
@@ -8,4 +8,7 @@ main =
|
||||
putStrLn "Введите имя файла:" >>
|
||||
getLine >>= \fileName ->
|
||||
readFile fileName >>= \content ->
|
||||
uprint $ splitText content
|
||||
let sentences = splitText content in
|
||||
uprint (take 10 sentences) >>
|
||||
let dict = buildDictionary sentences in
|
||||
saveDictionary "dict.txt" dict
|
||||
|
||||
@@ -21,6 +21,7 @@ description: Please see the README on GitHub at <https://github.com/gith
|
||||
|
||||
dependencies:
|
||||
- base >= 4.7 && < 5
|
||||
- containers
|
||||
- unescaping-print
|
||||
|
||||
ghc-options:
|
||||
|
||||
@@ -35,6 +35,7 @@ library
|
||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints
|
||||
build-depends:
|
||||
base >=4.7 && <5
|
||||
, containers
|
||||
, unescaping-print
|
||||
default-language: Haskell2010
|
||||
|
||||
@@ -49,6 +50,7 @@ executable part2-exe
|
||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
||||
build-depends:
|
||||
base >=4.7 && <5
|
||||
, containers
|
||||
, part2
|
||||
, unescaping-print
|
||||
default-language: Haskell2010
|
||||
@@ -65,6 +67,7 @@ test-suite part2-test
|
||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
||||
build-depends:
|
||||
base >=4.7 && <5
|
||||
, containers
|
||||
, part2
|
||||
, unescaping-print
|
||||
default-language: Haskell2010
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
module Lib where
|
||||
|
||||
import Data.Char (isLetter, toLower)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map
|
||||
import Data.List (nub, tails)
|
||||
import System.IO
|
||||
import UnescapingPrint (ushow)
|
||||
|
||||
|
||||
splitText :: String -> [[String]]
|
||||
@@ -22,4 +27,18 @@ splitText text = filter (not . null) $ map (processSentence . words) (splitSente
|
||||
processSentence = filter (not . null) . map cleanWord
|
||||
|
||||
cleanWord :: String -> String
|
||||
cleanWord = map toLower . filter isLetter
|
||||
cleanWord = map toLower . filter isLetter
|
||||
|
||||
buildDictionary :: [[String]] -> Map String [String]
|
||||
buildDictionary sentences =
|
||||
let bigrams = [ (w1, w2) | s <- sentences, (w1:w2:_) <- tails s ]
|
||||
trigrams = [ (w1, w2, w3) | s <- sentences, (w1:w2:w3:_) <- tails s ]
|
||||
singleKeys = foldr (\(w1, w2) acc -> Map.insertWith (++) w1 [w2] acc) Map.empty bigrams
|
||||
singleKeys' = foldr (\(w1, w2, w3) acc -> Map.insertWith (++) w1 [w2 ++ " " ++ w3] acc) singleKeys trigrams
|
||||
doubleKeys = foldr (\(w1, w2, w3) acc -> Map.insertWith (++) (w1 ++ " " ++ w2) [w3] acc) Map.empty trigrams
|
||||
combined = Map.unionWith (++) singleKeys' doubleKeys
|
||||
in Map.map nub combined
|
||||
|
||||
saveDictionary :: FilePath -> Map String [String] -> IO ()
|
||||
saveDictionary filePath dict = withFile filePath WriteMode $ \h ->
|
||||
mapM_ (\(k,v) -> hPutStrLn h $ ushow k ++ ": " ++ ushow v) (Map.toList dict)
|
||||
Reference in New Issue
Block a user