All checks were successful
Deploy Smart Search Backend Test / deploy (push) Successful in 1m44s
114 lines
2.7 KiB
Go
114 lines
2.7 KiB
Go
package fileparser
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bytes"
|
|
"encoding/xml"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"git.techease.ru/Smart-search/smart-search-back/pkg/errors"
|
|
)
|
|
|
|
func ExtractText(data []byte, _ string) (string, error) {
|
|
if len(data) == 0 {
|
|
return "", nil
|
|
}
|
|
|
|
mimeType := http.DetectContentType(data)
|
|
|
|
switch {
|
|
case strings.HasPrefix(mimeType, "text/"):
|
|
return string(data), nil
|
|
case mimeType == "application/zip" || mimeType == "application/octet-stream":
|
|
if isDocx(data) {
|
|
return extractDocx(data)
|
|
}
|
|
return "", errors.NewBusinessError(errors.UnsupportedFileFormat, "поддерживаются только текстовые файлы (.txt) и документы Word (.docx)")
|
|
default:
|
|
return "", errors.NewBusinessError(errors.UnsupportedFileFormat, "неподдерживаемый формат файла: "+mimeType+", поддерживаются .txt и .docx")
|
|
}
|
|
}
|
|
|
|
// isDocx reports whether data is a readable zip archive that contains an
// entry named exactly "word/document.xml". Input that cannot be opened as a
// zip archive is reported as false.
func isDocx(data []byte) bool {
	archive, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return false
	}

	// Scan the entry list and stop at the first match.
	found := false
	for i := 0; i < len(archive.File) && !found; i++ {
		found = archive.File[i].Name == "word/document.xml"
	}
	return found
}
|
|
|
|
func extractDocx(data []byte) (string, error) {
|
|
reader, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
|
|
if err != nil {
|
|
return "", errors.NewInternalError(errors.FileProcessingError, "не удалось прочитать docx файл", err)
|
|
}
|
|
|
|
var content string
|
|
for _, file := range reader.File {
|
|
if file.Name == "word/document.xml" {
|
|
rc, err := file.Open()
|
|
if err != nil {
|
|
return "", errors.NewInternalError(errors.FileProcessingError, "не удалось открыть содержимое документа", err)
|
|
}
|
|
defer func() { _ = rc.Close() }()
|
|
|
|
xmlData, err := io.ReadAll(rc)
|
|
if err != nil {
|
|
return "", errors.NewInternalError(errors.FileProcessingError, "не удалось прочитать содержимое документа", err)
|
|
}
|
|
|
|
content = extractTextFromXML(xmlData)
|
|
break
|
|
}
|
|
}
|
|
|
|
return content, nil
|
|
}
|
|
|
|
// Decoding model for the part of a Word document that encoding/xml is asked
// to map: document > body > p > r > t. The struct tags name elements without
// a namespace.
type (
	// docxDocument is the root element; XMLName pins its name to "document".
	docxDocument struct {
		XMLName xml.Name `xml:"document"`
		Body    docxBody `xml:"body"`
	}

	// docxBody collects the "p" (paragraph) children of the body element.
	docxBody struct {
		Paragraphs []docxParagraph `xml:"p"`
	}

	// docxParagraph collects the "r" (run) children of a paragraph.
	docxParagraph struct {
		Runs []docxRun `xml:"r"`
	}

	// docxRun holds the character data of a run's "t" (text) child.
	docxRun struct {
		Text string `xml:"t"`
	}
)
|
|
|
|
// extractTextFromXML returns the readable text of a Word main document part
// (the XML stored as word/document.xml).
//
// The XML is walked token by token instead of being decoded into a fixed
// struct shape, because paragraphs and runs may sit at any depth: inside
// table cells, hyperlinks, content controls, tracked insertions or text
// boxes. Elements are matched by local name only.
//
// Rules:
//   - The root element must be named "document"; otherwise "" is returned.
//   - Every "p" element starts and ends a paragraph. Paragraphs are joined
//     with "\n"; a paragraph that contributed no character data is skipped.
//   - Character data is taken from "t" elements whose parent is "r". A run
//     may hold several "t" elements; all of them are kept, in order.
//   - "tab" directly inside "r" becomes "\t"; "br" and "cr" directly inside
//     "r" become "\n". The parent check keeps tab-stop definitions in
//     paragraph properties from being mistaken for tab characters.
//   - Any XML error yields "" (partial output is discarded).
func extractTextFromXML(data []byte) string {
	dec := xml.NewDecoder(bytes.NewReader(data))

	var (
		paragraphs []string
		current    strings.Builder
		hasText    bool     // current paragraph received real character data
		path       []string // local names of the currently open elements
	)

	// flush closes the paragraph being built. Paragraphs made only of
	// breaks/tabs (for example a lone page break) are dropped.
	flush := func() {
		if hasText {
			paragraphs = append(paragraphs, current.String())
		}
		current.Reset()
		hasText = false
	}

	parentIs := func(name string) bool {
		return len(path) > 0 && path[len(path)-1] == name
	}

loop:
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return ""
		}

		switch el := tok.(type) {
		case xml.StartElement:
			name := el.Name.Local
			switch {
			case len(path) == 0 && name != "document":
				return ""
			case name == "p":
				// Also splits an outer paragraph around a nested one.
				flush()
			case name == "tab" && parentIs("r"):
				current.WriteByte('\t')
			case (name == "br" || name == "cr") && parentIs("r"):
				current.WriteByte('\n')
			}
			path = append(path, name)

		case xml.EndElement:
			if el.Name.Local == "p" {
				flush()
			}
			if len(path) > 0 {
				path = path[:len(path)-1]
			}
			if len(path) == 0 {
				// Root closed: ignore anything that follows it.
				break loop
			}

		case xml.CharData:
			n := len(path)
			if n >= 2 && path[n-1] == "t" && path[n-2] == "r" && len(el) > 0 {
				current.Write(el)
				hasText = true
			}
		}
	}

	// Keep run text that appeared outside of any paragraph element.
	flush()

	return strings.Join(paragraphs, "\n")
}
|