N-граммы в действии

This commit is contained in:
2024-12-09 16:48:54 +03:00
parent 5700d5892a
commit 93bb0f247c
4 changed files with 28 additions and 2 deletions

View File

@@ -8,4 +8,7 @@ main =
putStrLn "Введите имя файла:" >> putStrLn "Введите имя файла:" >>
getLine >>= \fileName -> getLine >>= \fileName ->
readFile fileName >>= \content -> readFile fileName >>= \content ->
uprint $ splitText content let sentences = splitText content in
uprint (take 10 sentences) >>
let dict = buildDictionary sentences in
saveDictionary "dict.txt" dict

View File

@@ -21,6 +21,7 @@ description: Please see the README on GitHub at <https://github.com/gith
dependencies: dependencies:
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- containers
- unescaping-print - unescaping-print
ghc-options: ghc-options:

View File

@@ -35,6 +35,7 @@ library
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints
build-depends: build-depends:
base >=4.7 && <5 base >=4.7 && <5
, containers
, unescaping-print , unescaping-print
default-language: Haskell2010 default-language: Haskell2010
@@ -49,6 +50,7 @@ executable part2-exe
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
base >=4.7 && <5 base >=4.7 && <5
, containers
, part2 , part2
, unescaping-print , unescaping-print
default-language: Haskell2010 default-language: Haskell2010
@@ -65,6 +67,7 @@ test-suite part2-test
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
base >=4.7 && <5 base >=4.7 && <5
, containers
, part2 , part2
, unescaping-print , unescaping-print
default-language: Haskell2010 default-language: Haskell2010

View File

@@ -1,6 +1,11 @@
module Lib where module Lib where
import Data.Char (isLetter, toLower) import Data.Char (isLetter, toLower)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.List (nub, tails)
import System.IO
import UnescapingPrint (ushow)
splitText :: String -> [[String]] splitText :: String -> [[String]]
@@ -22,4 +27,18 @@ splitText text = filter (not . null) $ map (processSentence . words) (splitSente
processSentence = filter (not . null) . map cleanWord processSentence = filter (not . null) . map cleanWord
cleanWord :: String -> String cleanWord :: String -> String
cleanWord = map toLower . filter isLetter cleanWord = map toLower . filter isLetter
buildDictionary :: [[String]] -> Map String [String]
buildDictionary sentences =
let bigrams = [ (w1, w2) | s <- sentences, (w1:w2:_) <- tails s ]
trigrams = [ (w1, w2, w3) | s <- sentences, (w1:w2:w3:_) <- tails s ]
singleKeys = foldr (\(w1, w2) acc -> Map.insertWith (++) w1 [w2] acc) Map.empty bigrams
singleKeys' = foldr (\(w1, w2, w3) acc -> Map.insertWith (++) w1 [w2 ++ " " ++ w3] acc) singleKeys trigrams
doubleKeys = foldr (\(w1, w2, w3) acc -> Map.insertWith (++) (w1 ++ " " ++ w2) [w3] acc) Map.empty trigrams
combined = Map.unionWith (++) singleKeys' doubleKeys
in Map.map nub combined
saveDictionary :: FilePath -> Map String [String] -> IO ()
saveDictionary filePath dict = withFile filePath WriteMode $ \h ->
mapM_ (\(k,v) -> hPutStrLn h $ ushow k ++ ": " ++ ushow v) (Map.toList dict)